From e8d9ed89559ad33167d23e562b52d18bb0c9f817 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 20 Sep 2013 19:19:52 +0400 Subject: [PATCH] ocl: split initialization.cpp into 3 files: context, operations, programcache --- modules/nonfree/src/surf.ocl.cpp | 2 +- modules/ocl/include/opencv2/ocl/ocl.hpp | 148 ++- .../ocl/include/opencv2/ocl/private/util.hpp | 242 ++-- modules/ocl/perf/main.cpp | 28 +- modules/ocl/src/arithm.cpp | 122 +- modules/ocl/src/bgfg_mog.cpp | 4 +- modules/ocl/src/binarycaching.hpp | 55 +- modules/ocl/src/brute_force_matcher.cpp | 8 +- modules/ocl/src/canny.cpp | 8 +- modules/ocl/src/cl_context.cpp | 507 ++++++++ modules/ocl/src/cl_operations.cpp | 434 +++++++ modules/ocl/src/cl_programcache.cpp | 311 +++++ modules/ocl/src/error.cpp | 16 +- modules/ocl/src/fft.cpp | 44 +- modules/ocl/src/filtering.cpp | 2 +- modules/ocl/src/gemm.cpp | 2 +- modules/ocl/src/gftt.cpp | 2 +- modules/ocl/src/haar.cpp | 14 +- modules/ocl/src/hog.cpp | 14 +- modules/ocl/src/imgproc.cpp | 38 +- modules/ocl/src/initialization.cpp | 1090 ----------------- modules/ocl/src/knearest.cpp | 14 +- modules/ocl/src/matrix_operations.cpp | 13 +- modules/ocl/src/mcwutil.cpp | 24 +- modules/ocl/src/moments.cpp | 4 +- modules/ocl/src/pyrdown.cpp | 4 +- modules/ocl/src/pyrlk.cpp | 4 +- modules/ocl/src/pyrup.cpp | 4 +- modules/ocl/src/split_merge.cpp | 4 +- modules/ocl/src/stereo_csbp.cpp | 36 +- modules/ocl/src/stereobp.cpp | 5 +- modules/ocl/src/tvl1flow.cpp | 2 +- modules/ocl/test/main.cpp | 33 +- modules/superres/perf/perf_superres_ocl.cpp | 3 - modules/superres/src/btv_l1_ocl.cpp | 2 +- modules/superres/test/test_superres.cpp | 2 - 36 files changed, 1705 insertions(+), 1540 deletions(-) create mode 100644 modules/ocl/src/cl_context.cpp create mode 100644 modules/ocl/src/cl_operations.cpp create mode 100644 modules/ocl/src/cl_programcache.cpp delete mode 100644 modules/ocl/src/initialization.cpp diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index f8c1ad7294..59eab705d6 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -74,7 +74,7 @@ namespace cv } cl_kernel kernel; kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr); - size_t wave_size = queryDeviceInfo(kernel); + size_t wave_size = queryWaveFrontSize(kernel); CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast(wave_size)); openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr); diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index c296f57a3d..21bb607471 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -57,8 +57,7 @@ namespace cv { namespace ocl { - using std::auto_ptr; - enum + enum DeviceType { CVCL_DEVICE_TYPE_DEFAULT = (1 << 0), CVCL_DEVICE_TYPE_CPU = (1 << 1), @@ -93,77 +92,113 @@ namespace cv //return -1 if the target type is unsupported, otherwise return 0 CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT); - //this class contains ocl runtime information - class CV_EXPORTS Info + // these classes contain OpenCL runtime information + + struct PlatformInfo; + + struct DeviceInfo { - public: - struct Impl; - Impl *impl; + int _id; // reserved, don't use it - Info(); - Info(const Info &m); - ~Info(); - void release(); - Info &operator = (const Info &m); - std::vector DeviceName; + DeviceType deviceType; + std::string deviceProfile; + std::string deviceVersion; + std::string deviceName; + std::string deviceVendor; + int deviceVendorId; + std::string deviceDriverVersion; + std::string deviceExtensions; + + size_t maxWorkGroupSize; + std::vector maxWorkItemSizes; + int maxComputeUnits; + size_t localMemorySize; + + int deviceVersionMajor; + int deviceVersionMinor; + + bool haveDoubleSupport; + bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0 + + std::string compilationExtraOptions; + + const PlatformInfo* platform; + + DeviceInfo(); + }; + + struct PlatformInfo + { + int _id; // reserved, don't use it + + std::string platformProfile; + std::string platformVersion; + std::string platformName; + std::string platformVendor; + std::string platformExtensons; + + int platformVersionMajor; + int platformVersionMinor; + + std::vector devices; + + PlatformInfo(); }; - //////////////////////////////// Initialization & Info //////////////////////// - //this function may be obsoleted - //CV_EXPORTS cl_device_id getDevice(); - //the function must be called before any other cv::ocl::functions, it initialize ocl runtime - //each Info relates to an OpenCL platform - //there is one or more devices in each platform, each one has a separate name - CV_EXPORTS int getDevice(std::vector &oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU); - //set device you want to use, optional function after getDevice be called - //the devnum is the index of the selected device in DeviceName vector of INfo - CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0); + //////////////////////////////// Initialization & Info //////////////////////// + typedef std::vector PlatformsInfo; - //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue - //returns cl_context * - CV_EXPORTS void* getoclContext(); - //returns cl_command_queue * - CV_EXPORTS void* getoclCommandQueue(); + CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms); - //explicit call clFinish. The global command queue will be used. - CV_EXPORTS void finish(); + typedef std::vector DevicesInfo; - //this function enable ocl module to use customized cl_context and cl_command_queue - //getDevice also need to be called before this function - CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0); + CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU, + const PlatformInfo* platform = NULL); - //returns true when global OpenCL context is initialized - CV_EXPORTS bool initialized(); + // set device you want to use + CV_EXPORTS void setDevice(const DeviceInfo* info); //////////////////////////////// Error handling //////////////////////// CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); - //////////////////////////////// OpenCL context //////////////////////// - //This is a global singleton class used to represent a OpenCL context. + enum FEATURE_TYPE + { + FEATURE_CL_DOUBLE = 1, + FEATURE_CL_UNIFIED_MEM, + FEATURE_CL_VER_1_2 + }; + + // Represents OpenCL context, interface class CV_EXPORTS Context { protected: - Context(); - friend class auto_ptr; - friend bool initialized(); - private: - static auto_ptr clCxt; - static int val; + Context() { } + ~Context() { } public: - ~Context(); - void release(); - Info::Impl* impl; - static Context* getContext(); - static void setContext(Info &oclinfo); - enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2}; - bool supportsFeature(int ftype) const; - size_t computeUnits() const; - void* oclContext(); - void* oclCommandQueue(); + bool supportsFeature(FEATURE_TYPE featureType) const; + const DeviceInfo& getDeviceInfo() const; + + const void* getOpenCLContextPtr() const; + const void* getOpenCLCommandQueuePtr() const; + const void* getOpenCLDeviceIDPtr() const; }; + inline const void *getClContextPtr() + { + return Context::getContext()->getOpenCLContextPtr(); + } + + inline const void *getClCommandQueuePtr() + { + return Context::getContext()->getOpenCLCommandQueuePtr(); + } + + bool CV_EXPORTS supportsFeature(FEATURE_TYPE featureType); + + void CV_EXPORTS finish(); + //! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing. CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName, @@ -384,7 +419,7 @@ namespace cv uchar *dataend; //! OpenCL context associated with the oclMat object. - Context *clCxt; + Context *clCxt; // TODO clCtx //add offset for handle ROI, calculated in byte int offset; //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used @@ -1879,11 +1914,6 @@ namespace cv oclMat temp5; }; - static inline size_t divUp(size_t total, size_t grain) - { - return (total + grain - 1) / grain; - } - /*!***************K Nearest Neighbour*************!*/ class CV_EXPORTS KNearestNeighbour: public CvKNearest { diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 3de0d438d4..2aba472f66 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -52,120 +52,138 @@ namespace cv { - namespace ocl +namespace ocl +{ + +inline cl_device_id getClDeviceID(const Context *ctx) +{ + return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr()); +} + +inline cl_context getClContext(const Context *ctx) +{ + return *(cl_context*)(ctx->getOpenCLContextPtr()); +} + +inline cl_command_queue getClCommandQueue(const Context *ctx) +{ + return *(cl_command_queue*)(ctx->getOpenCLCommandQueuePtr()); +} + +enum openCLMemcpyKind +{ + clMemcpyHostToDevice = 0, + clMemcpyDeviceToHost, + clMemcpyDeviceToDevice +}; +///////////////////////////OpenCL call wrappers//////////////////////////// +void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch, + size_t widthInBytes, size_t height); +void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch, + size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type); +void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, openCLMemcpyKind kind, int channels = -1); +void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, + const void *src, size_t spitch, + size_t width, size_t height, int src_offset); +void CV_EXPORTS openCLFree(void *devPtr); +cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); +void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); +cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, + const char **source, std::string kernelName); +cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, + const char **source, std::string kernelName, const char *build_options); +void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); +void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair > &args, + int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); +void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName, + size_t globalThreads[3], size_t localThreads[3], + std::vector< std::pair > &args, int channels, int depth, const char *build_options); +void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth); +void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, + int depth, const char *build_options); + +cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value, + const size_t size); + +cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); + +int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName); + +enum FLUSH_MODE +{ + CLFINISH = 0, + CLFLUSH, + DISABLE +}; + +void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE); +void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], + size_t localThreads[3], std::vector< std::pair > &args, int channels, + int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE); +// bind oclMat to OpenCL image textures +// note: +// 1. there is no memory management. User need to explicitly release the resource +// 2. for faster clamping, there is no buffer padding for the constructed texture +cl_mem CV_EXPORTS bindTexture(const oclMat &mat); +void CV_EXPORTS releaseTexture(cl_mem& texture); + +//Represents an image texture object +class CV_EXPORTS TextureCL +{ +public: + TextureCL(cl_mem tex, int r, int c, int t) + : tex_(tex), rows(r), cols(c), type(t) {} + ~TextureCL() + { + openCLFree(tex_); + } + operator cl_mem() { - enum openCLMemcpyKind - { - clMemcpyHostToDevice = 0, - clMemcpyDeviceToHost, - clMemcpyDeviceToDevice - }; - ///////////////////////////OpenCL call wrappers//////////////////////////// - void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch, - size_t widthInBytes, size_t height); - void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch, - size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type); - void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, - const void *src, size_t spitch, - size_t width, size_t height, openCLMemcpyKind kind, int channels = -1); - void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, - const void *src, size_t spitch, - size_t width, size_t height, int src_offset); - void CV_EXPORTS openCLFree(void *devPtr); - cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); - void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); - cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, - const char **source, std::string kernelName); - cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt, - const char **source, std::string kernelName, const char *build_options); - void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); - void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair > &args, - int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); - void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName, - size_t globalThreads[3], size_t localThreads[3], - std::vector< std::pair > &args, int channels, int depth, const char *build_options); - void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth); - void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, - int depth, const char *build_options); - - cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value, - const size_t size); - - cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); - - int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName); - - enum FLUSH_MODE - { - CLFINISH = 0, - CLFLUSH, - DISABLE - }; - - void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE); - void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, - int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE); - // bind oclMat to OpenCL image textures - // note: - // 1. there is no memory management. User need to explicitly release the resource - // 2. for faster clamping, there is no buffer padding for the constructed texture - cl_mem CV_EXPORTS bindTexture(const oclMat &mat); - void CV_EXPORTS releaseTexture(cl_mem& texture); - - //Represents an image texture object - class CV_EXPORTS TextureCL - { - public: - TextureCL(cl_mem tex, int r, int c, int t) - : tex_(tex), rows(r), cols(c), type(t) {} - ~TextureCL() - { - openCLFree(tex_); - } - operator cl_mem() - { - return tex_; - } - cl_mem const tex_; - const int rows; - const int cols; - const int type; - private: - //disable assignment - void operator=(const TextureCL&); - }; - // bind oclMat to OpenCL image textures and retunrs an TextureCL object - // note: - // for faster clamping, there is no buffer padding for the constructed texture - Ptr CV_EXPORTS bindTexturePtr(const oclMat &mat); - - // returns whether the current context supports image2d_t format or not - bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext()); - - // the enums are used to query device information - // currently only support wavefront size queries - enum DEVICE_INFO - { - WAVEFRONT_SIZE, //in AMD speak - IS_CPU_DEVICE //check if the device is CPU - }; - template - _ty queryDeviceInfo(cl_kernel kernel = NULL); - - template<> - int CV_EXPORTS queryDeviceInfo(cl_kernel kernel); - template<> - size_t CV_EXPORTS queryDeviceInfo(cl_kernel kernel); - template<> - bool CV_EXPORTS queryDeviceInfo(cl_kernel kernel); - - unsigned long CV_EXPORTS queryLocalMemInfo(); - }//namespace ocl + return tex_; + } + cl_mem const tex_; + const int rows; + const int cols; + const int type; +private: + //disable assignment + void operator=(const TextureCL&); +}; +// bind oclMat to OpenCL image textures and retunrs an TextureCL object +// note: +// for faster clamping, there is no buffer padding for the constructed texture +Ptr CV_EXPORTS bindTexturePtr(const oclMat &mat); + +// returns whether the current context supports image2d_t format or not +bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext()); + +bool CV_EXPORTS isCpuDevice(); + +size_t CV_EXPORTS queryWaveFrontSize(cl_kernel kernel); + + +inline size_t divUp(size_t total, size_t grain) +{ + return (total + grain - 1) / grain; +} + +inline size_t roundUp(size_t sz, size_t n) +{ + // we don't assume that n is a power of 2 (see alignSize) + // equal to divUp(sz, n) * n + size_t t = sz + n - 1; + size_t rem = t % n; + size_t result = t - rem; + return result; +} + +}//namespace ocl }//namespace cv #endif //__OPENCV_OCL_PRIVATE_UTIL__ diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp index e24c2c14e5..e82af4e322 100644 --- a/modules/ocl/perf/main.cpp +++ b/modules/ocl/perf/main.cpp @@ -56,7 +56,7 @@ int main(int argc, char ** argv) const char * keys = "{ h | help | false | print help message }" "{ t | type | gpu | set device type:cpu or gpu}" - "{ p | platform | 0 | set platform id }" + "{ p | platform | -1 | set platform id }" "{ d | device | 0 | set device id }"; CommandLineParser cmd(argc, argv, keys); @@ -68,28 +68,34 @@ int main(int argc, char ** argv) } string type = cmd.get("type"); - unsigned int pid = cmd.get("platform"); + int pid = cmd.get("platform"); int device = cmd.get("device"); int flag = type == "cpu" ? cv::ocl::CVCL_DEVICE_TYPE_CPU : cv::ocl::CVCL_DEVICE_TYPE_GPU; - std::vector oclinfo; - int devnums = cv::ocl::getDevice(oclinfo, flag); - if (devnums <= device || device < 0) + cv::ocl::PlatformsInfo platformsInfo; + cv::ocl::getOpenCLPlatforms(platformsInfo); + if (pid >= (int)platformsInfo.size()) { - std::cout << "device invalid\n"; - return -1; + std::cout << "platform is invalid\n"; + return 1; } - if (pid >= oclinfo.size()) + cv::ocl::DevicesInfo devicesInfo; + int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]); + if (device < 0 || device >= devnums) { - std::cout << "platform invalid\n"; - return -1; + std::cout << "device/platform invalid\n"; + return 1; } - cv::ocl::setDevice(oclinfo[pid], device); + cv::ocl::setDevice(devicesInfo[device]); cv::ocl::setBinaryDiskCache(cv::ocl::CACHE_UPDATE); + cout << "Device type:" << type << endl + << "Platform name:" << devicesInfo[device]->platform->platformName << endl + << "Device name:" << devicesInfo[device]->deviceName << endl; + CV_PERF_TEST_MAIN_INTERNALS(ocl, impls) } diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 2a663b990a..7d97e67545 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -51,50 +51,10 @@ //M*/ #include "precomp.hpp" -#include - +#include "opencl_kernels.hpp" using namespace cv; using namespace cv::ocl; -using namespace std; - -namespace cv -{ - namespace ocl - { - //////////////////////////////// OpenCL kernel strings ///////////////////// - - extern const char *arithm_absdiff_nonsaturate; - extern const char *arithm_nonzero; - extern const char *arithm_sum; - extern const char *arithm_minMax; - extern const char *arithm_minMaxLoc; - extern const char *arithm_minMaxLoc_mask; - extern const char *arithm_LUT; - extern const char *arithm_add; - extern const char *arithm_add_mask; - extern const char *arithm_add_scalar; - extern const char *arithm_add_scalar_mask; - extern const char *arithm_bitwise_binary; - extern const char *arithm_bitwise_binary_mask; - extern const char *arithm_bitwise_binary_scalar; - extern const char *arithm_bitwise_binary_scalar_mask; - extern const char *arithm_bitwise_not; - extern const char *arithm_compare; - extern const char *arithm_transpose; - extern const char *arithm_flip; - extern const char *arithm_flip_rc; - extern const char *arithm_magnitude; - extern const char *arithm_cartToPolar; - extern const char *arithm_polarToCart; - extern const char *arithm_exp; - extern const char *arithm_log; - extern const char *arithm_addWeighted; - extern const char *arithm_phase; - extern const char *arithm_pow; - extern const char *arithm_setidentity; - } -} ////////////////////////////////////////////////////////////////////////////// /////////////////////// add subtract multiply divide ///////////////////////// @@ -106,7 +66,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const oclMat &dst, int op_type, bool use_scalar = false) { Context *clCxt = src1.clCxt; - bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); + bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE); if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F)) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); @@ -264,7 +224,7 @@ void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst) ////////////////////////////////////////////////////////////////////////////// static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp, - string kernelName, const char **kernelString) + string kernelName, const cv::ocl::ProgramEntry* source) { CV_Assert(src1.type() == src2.type()); dst.create(src1.size(), CV_8UC1); @@ -295,13 +255,13 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str()); } void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp) { - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -358,7 +318,7 @@ Scalar arithmetic_sum(const oclMat &src, int type, int ddepth) { CV_Assert(src.step % src.elemSize() == 0); - size_t groupnum = src.clCxt->computeUnits(); + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; CV_Assert(groupnum != 0); int dbsize = groupnum * src.oclchannels(); @@ -385,7 +345,7 @@ typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth); Scalar cv::ocl::sum(const oclMat &src) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } @@ -396,7 +356,7 @@ Scalar cv::ocl::sum(const oclMat &src) arithmetic_sum }; - bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE); int ddepth = std::max(src.depth(), CV_32S); if (!hasDouble && ddepth == CV_64F) ddepth = CV_32F; @@ -407,7 +367,7 @@ Scalar cv::ocl::sum(const oclMat &src) Scalar cv::ocl::absSum(const oclMat &src) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } @@ -418,7 +378,7 @@ Scalar cv::ocl::absSum(const oclMat &src) arithmetic_sum }; - bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE); int ddepth = std::max(src.depth(), CV_32S); if (!hasDouble && ddepth == CV_64F) ddepth = CV_32F; @@ -429,7 +389,7 @@ Scalar cv::ocl::absSum(const oclMat &src) Scalar cv::ocl::sqrSum(const oclMat &src) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } @@ -440,7 +400,7 @@ Scalar cv::ocl::sqrSum(const oclMat &src) arithmetic_sum }; - bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE); int ddepth = src.depth() <= CV_32S ? CV_32S : (hasDouble ? CV_64F : CV_32F); sumFunc func = functab[ddepth - CV_32S]; @@ -524,7 +484,7 @@ template void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf) { - size_t groupnum = src.clCxt->computeUnits(); + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; CV_Assert(groupnum != 0); int dbsize = groupnum * 2 * src.elemSize(); @@ -566,7 +526,7 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons if (minVal == NULL && maxVal == NULL) return; - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } @@ -699,7 +659,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType) static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -746,7 +706,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -792,9 +752,9 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - const char **kernelString = isVertical ? &arithm_flip_rc : &arithm_flip; + const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip; - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth); } void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) @@ -860,10 +820,10 @@ void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst) //////////////////////////////// exp log ///////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString) +static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source) { Context *clCxt = src.clCxt; - if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -893,7 +853,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), -1, buildOptions.c_str()); } @@ -913,7 +873,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst) static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName) { - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -955,9 +915,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst) arithmetic_magnitude_phase_run(src1, src2, dst, "arithm_magnitude"); } -static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString) +static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source) { - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -985,7 +945,7 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth); } void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees) @@ -1004,7 +964,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleI static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart, string kernelName, bool angleInDegrees) { - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -1057,7 +1017,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees, string kernelName) { - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -1176,7 +1136,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, Point *minLoc, Point *maxLoc, const oclMat &mask) { CV_Assert(src.oclchannels() == 1); - size_t groupnum = src.clCxt->computeUnits(); + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; CV_Assert(groupnum != 0); int minloc = -1 , maxloc = -1; int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ; @@ -1238,7 +1198,7 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal, void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal, Point *minLoc, Point *maxLoc, const oclMat &mask) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); return; @@ -1251,7 +1211,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal, }; minMaxLocFunc func; - func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)]; + func = functab[(int)src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)]; func(src, minVal, maxVal, minLoc, maxLoc, mask); } @@ -1296,7 +1256,7 @@ int cv::ocl::countNonZero(const oclMat &src) CV_Assert(src.channels() == 1); Context *clCxt = src.clCxt; - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "selected device doesn't support double"); } @@ -1327,7 +1287,7 @@ int cv::ocl::countNonZero(const oclMat &src) ////////////////////////////////bitwise_op//////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString) +static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source) { dst.create(src1.size(), src1.type()); @@ -1361,7 +1321,7 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth); } enum { AND = 0, OR, XOR }; @@ -1370,7 +1330,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca oclMat &dst, int operationType) { Context *clCxt = src1.clCxt; - if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F) + if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F) { cout << "Selected device does not support double" << endl; return; @@ -1442,7 +1402,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { cout << "Selected device does not support double" << endl; return; @@ -1571,7 +1531,7 @@ oclMatExpr::operator oclMat() const static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false) { Context *clCxt = src.clCxt; - if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); return; @@ -1623,7 +1583,7 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst) { Context *clCxt = src1.clCxt; - bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); + bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE); if (!hasDouble && src1.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n"); @@ -1688,7 +1648,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, /////////////////////////////////// Pow ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString) +static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source) { CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); CV_Assert(src1.type() == dst.type()); @@ -1718,17 +1678,17 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); float pf = static_cast(p); - if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE)) + if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) args.push_back( make_pair( sizeof(cl_float), (void *)&pf )); else args.push_back( make_pair( sizeof(cl_double), (void *)&p )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth); } void cv::ocl::pow(const oclMat &x, double p, oclMat &y) { - if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F) + if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.type() == CV_64F) { cout << "Selected device do not support double" << endl; return; diff --git a/modules/ocl/src/bgfg_mog.cpp b/modules/ocl/src/bgfg_mog.cpp index 3051ac82f3..cb0dee80f8 100644 --- a/modules/ocl/src/bgfg_mog.cpp +++ b/modules/ocl/src/bgfg_mog.cpp @@ -392,7 +392,7 @@ void cv::ocl::device::mog::loadConstants(float Tb, float TB, float Tg, float var constants->c_tau = tau; constants->c_shadowVal = shadowVal; - cl_constants = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), + cl_constants = load_constant(*((cl_context*)getClContextPtr()), *((cl_command_queue*)getClCommandQueuePtr()), (void *)constants, sizeof(_contant_struct)); } @@ -635,4 +635,4 @@ void cv::ocl::MOG2::release() mean_.release(); bgmodelUsedModes_.release(); -} \ No newline at end of file +} diff --git a/modules/ocl/src/binarycaching.hpp b/modules/ocl/src/binarycaching.hpp index 0ec565f88b..cc9e71a330 100644 --- a/modules/ocl/src/binarycaching.hpp +++ b/modules/ocl/src/binarycaching.hpp @@ -50,41 +50,36 @@ using namespace std; using std::cout; using std::endl; -namespace cv +namespace cv { namespace ocl { + +class ProgramCache { - namespace ocl - { - class ProgramCache - { - protected: - ProgramCache(); - friend class auto_ptr; - static auto_ptr programCache; +protected: + ProgramCache(); + ~ProgramCache(); + friend class std::auto_ptr; +public: + static ProgramCache *getProgramCache(); - public: - ~ProgramCache(); - static ProgramCache *getProgramCache() - { - if( NULL == programCache.get()) - programCache.reset(new ProgramCache()); - return programCache.get(); - } + cl_program getProgram(const Context *ctx, const char **source, string kernelName, + const char *build_options); - //lookup the binary given the file name - cl_program progLookup(string srcsign); + void releaseProgram(); +protected: + //lookup the binary given the file name + cl_program progLookup(string srcsign); - //add program to the cache - void addProgram(string srcsign, cl_program program); - void releaseProgram(); + //add program to the cache + void addProgram(string srcsign, cl_program program); - map codeCache; - unsigned int cacheSize; - //The presumed watermark for the cache volume (256MB). Is it enough? - //We may need more delicate algorithms when necessary later. - //Right now, let's just leave it along. - static const unsigned MAX_PROG_CACHE_SIZE = 1024; - }; + map codeCache; + unsigned int cacheSize; - }//namespace ocl + //The presumed watermark for the cache volume (256MB). Is it enough? + //We may need more delicate algorithms when necessary later. + //Right now, let's just leave it along. + static const unsigned MAX_PROG_CACHE_SIZE = 1024; +}; +}//namespace ocl }//namespace cv diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 74da6ddd06..0273ed5891 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -245,7 +245,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM { const oclMat zeroMask; const oclMat &tempMask = mask.data ? mask : zeroMask; - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (query.cols <= 64) { matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType); @@ -265,7 +265,7 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co { const oclMat zeroMask; const oclMat &tempMask = mask.data ? mask : zeroMask; - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (query.cols <= 64) { matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType); @@ -286,7 +286,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD { const oclMat zeroMask; const oclMat &tempMask = mask.data ? mask : zeroMask; - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (query.cols <= 64) { matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType); @@ -469,7 +469,7 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &trainIdx, const oclMat &distance, int distType) { - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (query.cols <= 64) { knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType); diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index 4c7b988f6f..a25c1973ef 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -98,7 +98,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size) { openCLFree(counter); } - counter = clCreateBuffer( *((cl_context*)getoclContext()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); + counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); openCLSafeCall(err); } @@ -354,7 +354,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) { unsigned int count; - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); + openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); Context *clCxt = map.clCxt; string kernelName = "edgesHysteresisGlobal"; vector< pair > args; @@ -363,7 +363,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi int count_i[1] = {0}; while(count > 0) { - openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); args.clear(); size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1}; @@ -378,7 +378,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); + openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); std::swap(st1, st2); } } diff --git a/modules/ocl/src/cl_context.cpp b/modules/ocl/src/cl_context.cpp new file mode 100644 index 0000000000..6413465f65 --- /dev/null +++ b/modules/ocl/src/cl_context.cpp @@ -0,0 +1,507 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Guoping Long, longguoping@gmail.com +// Niko Li, newlife20080214@gmail.com +// Yao Wang, bitwangyaoyao@gmail.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include +#include +#include "binarycaching.hpp" + +#undef __CL_ENABLE_EXCEPTIONS +#include + +namespace cv { namespace ocl { + +extern void fft_teardown(); +extern void clBlasTeardown(); + +struct PlatformInfoImpl +{ + cl_platform_id platform_id; + + std::vector deviceIDs; + + PlatformInfo info; + + PlatformInfoImpl() + : platform_id(NULL) + { + } +}; + +struct DeviceInfoImpl +{ + cl_platform_id platform_id; + cl_device_id device_id; + + DeviceInfo info; + + DeviceInfoImpl() + : platform_id(NULL), device_id(NULL) + { + } +}; + +static std::vector global_platforms; +static std::vector global_devices; + +static bool parseOpenCLVersion(const std::string& versionStr, int& major, int& minor) +{ + size_t p0 = versionStr.find(' '); + while (true) + { + if (p0 == std::string::npos) + break; + if (p0 + 1 >= versionStr.length()) + break; + char c = versionStr[p0 + 1]; + if (isdigit(c)) + break; + p0 = versionStr.find(' ', p0 + 1); + } + size_t p1 = versionStr.find('.', p0); + size_t p2 = versionStr.find(' ', p1); + if (p0 == std::string::npos || p1 == std::string::npos || p2 == std::string::npos) + { + major = 0; + minor = 0; + return false; + } + std::string majorStr = versionStr.substr(p0 + 1, p1 - p0 - 1); + std::string minorStr = versionStr.substr(p1 + 1, p2 - p1 - 1); + major = atoi(majorStr.c_str()); + minor = atoi(minorStr.c_str()); + return true; +} + +static int initializeOpenCLDevices() +{ + assert(global_devices.size() == 0); + + std::vector platforms; + try + { + openCLSafeCall(cl::Platform::get(&platforms)); + } + catch (cv::Exception& e) + { + return 0; // OpenCL not found + } + + global_platforms.resize(platforms.size()); + + for (size_t i = 0; i < platforms.size(); ++i) + { + PlatformInfoImpl& platformInfo = global_platforms[i]; + platformInfo.info._id = i; + + cl::Platform& platform = platforms[i]; + + platformInfo.platform_id = platform(); + openCLSafeCall(platform.getInfo(CL_PLATFORM_PROFILE, &platformInfo.info.platformProfile)); + openCLSafeCall(platform.getInfo(CL_PLATFORM_VERSION, &platformInfo.info.platformVersion)); + openCLSafeCall(platform.getInfo(CL_PLATFORM_NAME, &platformInfo.info.platformName)); + openCLSafeCall(platform.getInfo(CL_PLATFORM_VENDOR, &platformInfo.info.platformVendor)); + openCLSafeCall(platform.getInfo(CL_PLATFORM_EXTENSIONS, &platformInfo.info.platformExtensons)); + + parseOpenCLVersion(platformInfo.info.platformVersion, + platformInfo.info.platformVersionMajor, platformInfo.info.platformVersionMinor); + + std::vector devices; + cl_int status = platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if(status != CL_DEVICE_NOT_FOUND) + openCLVerifyCall(status); + + if(devices.size() > 0) + { + int baseIndx = global_devices.size(); + global_devices.resize(baseIndx + devices.size()); + platformInfo.deviceIDs.resize(devices.size()); + platformInfo.info.devices.resize(devices.size()); + + for(size_t j = 0; j < devices.size(); ++j) + { + cl::Device& device = devices[j]; + + DeviceInfoImpl& deviceInfo = global_devices[baseIndx + j]; + deviceInfo.info._id = baseIndx + j; + deviceInfo.platform_id = platform(); + deviceInfo.device_id = device(); + + deviceInfo.info.platform = &platformInfo.info; + platformInfo.deviceIDs[j] = deviceInfo.info._id; + + cl_device_type type = -1; + openCLSafeCall(device.getInfo(CL_DEVICE_TYPE, &type)); + deviceInfo.info.deviceType = DeviceType(type); + + openCLSafeCall(device.getInfo(CL_DEVICE_PROFILE, &deviceInfo.info.deviceProfile)); + openCLSafeCall(device.getInfo(CL_DEVICE_VERSION, &deviceInfo.info.deviceVersion)); + openCLSafeCall(device.getInfo(CL_DEVICE_NAME, &deviceInfo.info.deviceName)); + openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR, &deviceInfo.info.deviceVendor)); + cl_uint vendorID = -1; + openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR_ID, &vendorID)); + deviceInfo.info.deviceVendorId = vendorID; + openCLSafeCall(device.getInfo(CL_DRIVER_VERSION, &deviceInfo.info.deviceDriverVersion)); + openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions)); + + parseOpenCLVersion(deviceInfo.info.deviceVersion, + deviceInfo.info.deviceVersionMajor, deviceInfo.info.deviceVersionMinor); + + size_t maxWorkGroupSize = 0; + openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize)); + deviceInfo.info.maxWorkGroupSize = maxWorkGroupSize; + + cl_uint maxDimensions = 0; + openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions)); + std::vector maxWorkItemSizes(maxDimensions); + openCLSafeCall(clGetDeviceInfo(device(), CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, + (void *)&maxWorkItemSizes[0], 0)); + deviceInfo.info.maxWorkItemSizes = maxWorkItemSizes; + + cl_uint maxComputeUnits = 0; + openCLSafeCall(device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &maxComputeUnits)); + deviceInfo.info.maxComputeUnits = maxComputeUnits; + + cl_ulong localMemorySize = 0; + openCLSafeCall(device.getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &localMemorySize)); + deviceInfo.info.localMemorySize = (size_t)localMemorySize; + + + cl_bool unifiedMemory = false; + openCLSafeCall(device.getInfo(CL_DEVICE_HOST_UNIFIED_MEMORY, &unifiedMemory)); + deviceInfo.info.isUnifiedMemory = unifiedMemory != 0; + + //initialize extra options for compilation. Currently only fp64 is included. + //Assume 4KB is enough to store all possible extensions. + openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions)); + + size_t fp64_khr = deviceInfo.info.deviceExtensions.find("cl_khr_fp64"); + if(fp64_khr != std::string::npos) + { + deviceInfo.info.compilationExtraOptions += "-D DOUBLE_SUPPORT"; + deviceInfo.info.haveDoubleSupport = true; + } + else + { + deviceInfo.info.haveDoubleSupport = false; + } + } + } + } + + for (size_t i = 0; i < platforms.size(); ++i) + { + PlatformInfoImpl& platformInfo = global_platforms[i]; + for(size_t j = 0; j < platformInfo.deviceIDs.size(); ++j) + { + DeviceInfoImpl& deviceInfo = global_devices[platformInfo.deviceIDs[j]]; + platformInfo.info.devices[j] = &deviceInfo.info; + } + } + + return global_devices.size(); +} + + +DeviceInfo::DeviceInfo() + : _id(-1), deviceType(DeviceType(0)), + deviceVendorId(-1), + maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), + deviceVersionMajor(0), deviceVersionMinor(0), + haveDoubleSupport(false), isUnifiedMemory(false), + platform(NULL) +{ + // nothing +} + +PlatformInfo::PlatformInfo() + : _id(-1), + platformVersionMajor(0), platformVersionMinor(0) +{ + // nothing +} + +//////////////////////////////// OpenCL context //////////////////////// +//This is a global singleton class used to represent a OpenCL context. +class ContextImpl : public Context +{ +public: + const cl_device_id clDeviceID; + cl_context clContext; + cl_command_queue clCmdQueue; + const DeviceInfo& deviceInfo; + +protected: + ContextImpl(const DeviceInfo& deviceInfo, cl_device_id clDeviceID) + : clDeviceID(clDeviceID), clContext(NULL), clCmdQueue(NULL), deviceInfo(deviceInfo) + { + // nothing + } + ~ContextImpl(); +public: + + static ContextImpl* getContext(); + static void setContext(const DeviceInfo* deviceInfo); + + bool supportsFeature(FEATURE_TYPE featureType) const; + + static void cleanupContext(void); +}; + +static cv::Mutex currentContextMutex; +static ContextImpl* currentContext = NULL; + +Context* Context::getContext() +{ + return currentContext; +} + +bool Context::supportsFeature(FEATURE_TYPE featureType) const +{ + return ((ContextImpl*)this)->supportsFeature(featureType); +} + +const DeviceInfo& Context::getDeviceInfo() const +{ + return ((ContextImpl*)this)->deviceInfo; +} + +const void* Context::getOpenCLContextPtr() const +{ + return &(((ContextImpl*)this)->clContext); +} + +const void* Context::getOpenCLCommandQueuePtr() const +{ + return &(((ContextImpl*)this)->clCmdQueue); +} + +const void* Context::getOpenCLDeviceIDPtr() const +{ + return &(((ContextImpl*)this)->clDeviceID); +} + + +bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const +{ + switch (featureType) + { + case FEATURE_CL_DOUBLE: + return deviceInfo.haveDoubleSupport; + case FEATURE_CL_UNIFIED_MEM: + return deviceInfo.isUnifiedMemory; + case FEATURE_CL_VER_1_2: + return deviceInfo.deviceVersionMajor > 1 || (deviceInfo.deviceVersionMajor == 1 && deviceInfo.deviceVersionMinor >= 2); + } + CV_Error(CV_StsBadArg, "Invalid feature type"); + return false; +} + +#if defined(WIN32) +static bool __termination = false; +#endif + +ContextImpl::~ContextImpl() +{ + fft_teardown(); + clBlasTeardown(); + +#ifdef WIN32 + // if process is on termination stage (ExitProcess was called and other threads were terminated) + // then disable command queue release because it may cause program hang + if (!__termination) +#endif + { + if(clCmdQueue) + { + openCLSafeCall(clReleaseCommandQueue(clCmdQueue)); // some cleanup problems are here + } + + if(clContext) + { + openCLSafeCall(clReleaseContext(clContext)); + } + } + clCmdQueue = NULL; + clContext = NULL; +} + +void ContextImpl::cleanupContext(void) +{ + cv::AutoLock lock(currentContextMutex); + if (currentContext) + delete currentContext; + currentContext = NULL; +} + +void ContextImpl::setContext(const DeviceInfo* deviceInfo) +{ + CV_Assert(deviceInfo->_id >= 0 && deviceInfo->_id < (int)global_devices.size()); + + DeviceInfoImpl& infoImpl = global_devices[deviceInfo->_id]; + CV_Assert(deviceInfo == &infoImpl.info); + + cl_int status = 0; + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(infoImpl.platform_id), 0 }; + cl_context clContext = clCreateContext(cps, 1, &infoImpl.device_id, NULL, NULL, &status); + openCLVerifyCall(status); + // TODO add CL_QUEUE_PROFILING_ENABLE + cl_command_queue clCmdQueue = clCreateCommandQueue(clContext, infoImpl.device_id, 0, &status); + openCLVerifyCall(status); + + ContextImpl* ctx = new ContextImpl(infoImpl.info, infoImpl.device_id); + ctx->clCmdQueue = clCmdQueue; + ctx->clContext = clContext; + + ContextImpl* old = NULL; + { + cv::AutoLock lock(currentContextMutex); + old = currentContext; + currentContext = ctx; + } + if (old != NULL) + { + delete old; + } +} + +ContextImpl* ContextImpl::getContext() +{ + return currentContext; +} + +int getOpenCLPlatforms(PlatformsInfo& platforms) +{ + platforms.clear(); + + for (size_t id = 0; id < global_platforms.size(); ++id) + { + PlatformInfoImpl& impl = global_platforms[id]; + platforms.push_back(&impl.info); + } + + return platforms.size(); +} + +int getOpenCLDevices(std::vector &devices, int deviceType, const PlatformInfo* platform) +{ + devices.clear(); + + switch(deviceType) + { + case CVCL_DEVICE_TYPE_DEFAULT: + case CVCL_DEVICE_TYPE_CPU: + case CVCL_DEVICE_TYPE_GPU: + case CVCL_DEVICE_TYPE_ACCELERATOR: + case CVCL_DEVICE_TYPE_ALL: + break; + default: + return 0; + } + + if (platform == NULL) + { + for (size_t id = 0; id < global_devices.size(); ++id) + { + DeviceInfoImpl& deviceInfo = global_devices[id]; + if (((int)deviceInfo.info.deviceType & deviceType) == deviceType) + { + devices.push_back(&deviceInfo.info); + } + } + } + else + { + for (size_t id = 0; id < platform->devices.size(); ++id) + { + const DeviceInfo* deviceInfo = platform->devices[id]; + if (((int)deviceInfo->deviceType & deviceType) == deviceType) + { + devices.push_back(deviceInfo); + } + } + } + + return (int)devices.size(); +} + +void setDevice(const DeviceInfo* info) +{ + ContextImpl::setContext(info); +} + +bool supportsFeature(FEATURE_TYPE featureType) +{ + return Context::getContext()->supportsFeature(featureType); +} + +struct __Module +{ + __Module() { initializeOpenCLDevices(); } + ~__Module() { ContextImpl::cleanupContext(); } +}; +static __Module __module; + + +}//namespace ocl +}//namespace cv + + +#if defined(WIN32) && defined(CVAPI_EXPORTS) + +extern "C" +BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD fdwReason, LPVOID lpReserved) +{ + if (fdwReason == DLL_PROCESS_DETACH) + { + if (lpReserved != NULL) // called after ExitProcess() call + cv::ocl::__termination = true; + } + return TRUE; +} + +#endif diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp new file mode 100644 index 0000000000..42138adbe0 --- /dev/null +++ b/modules/ocl/src/cl_operations.cpp @@ -0,0 +1,434 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Guoping Long, longguoping@gmail.com +// Niko Li, newlife20080214@gmail.com +// Yao Wang, bitwangyaoyao@gmail.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include +#include +#include "binarycaching.hpp" + +#undef __CL_ENABLE_EXCEPTIONS +#include + +//#define PRINT_KERNEL_RUN_TIME +#define RUN_TIMES 100 +#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD +#define CL_MEM_USE_PERSISTENT_MEM_AMD 0 +#endif +//#define AMD_DOUBLE_DIFFER + +namespace cv { namespace ocl { + +DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT; +DevMemRW gDeviceMemRW = DEVICE_MEM_R_W; +int gDevMemTypeValueMap[5] = {0, + CL_MEM_ALLOC_HOST_PTR, + CL_MEM_USE_HOST_PTR, + CL_MEM_COPY_HOST_PTR, + CL_MEM_USE_PERSISTENT_MEM_AMD}; +int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY}; + +void finish() +{ + clFinish(getClCommandQueue(Context::getContext())); +} + +bool isCpuDevice() +{ + const DeviceInfo& info = Context::getContext()->getDeviceInfo(); + return (info.deviceType == CVCL_DEVICE_TYPE_CPU); +} + +size_t queryWaveFrontSize(cl_kernel kernel) +{ + const DeviceInfo& info = Context::getContext()->getDeviceInfo(); + if (info.deviceType == CVCL_DEVICE_TYPE_CPU) + return 1; + size_t wavefront = 0; + CV_Assert(kernel != NULL); + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()), + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL)); + return wavefront; +} + + +void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size) +{ + cl_int status; + status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0, + size, host_buffer, 0, NULL, NULL); + openCLVerifyCall(status); +} + +cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size) +{ + cl_int status; + cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status); + openCLVerifyCall(status); + return buffer; +} + +void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch, + size_t widthInBytes, size_t height) +{ + openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType); +} + +void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch, + size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type) +{ + cl_int status; + *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], + widthInBytes * height, 0, &status); + openCLVerifyCall(status); + *pitch = widthInBytes; +} + +void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, openCLMemcpyKind kind, int channels) +{ + size_t buffer_origin[3] = {0, 0, 0}; + size_t host_origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + if(kind == clMemcpyHostToDevice) + { + if(dpitch == width || channels == 3 || height == 1) + { + openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE, + 0, width * height, src, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE, + buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); + } + } + else if(kind == clMemcpyDeviceToHost) + { + if(spitch == width || channels == 3 || height == 1) + { + openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE, + 0, width * height, dst, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE, + buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); + } + } +} + +void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset, + const void *src, size_t spitch, + size_t width, size_t height, int src_offset) +{ + size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0}; + size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0}; + size_t region[3] = {width, height, 1}; + + openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin, + region, spitch, 0, dpitch, 0, 0, 0, 0)); +} + +void openCLFree(void *devPtr) +{ + openCLSafeCall(clReleaseMemObject((cl_mem)devPtr)); +} + +cl_kernel openCLGetKernelFromSource(const Context *ctx, const char **source, string kernelName) +{ + return openCLGetKernelFromSource(ctx, source, kernelName, NULL); +} + +cl_kernel openCLGetKernelFromSource(const Context *ctx, const char **source, string kernelName, + const char *build_options) +{ + cl_kernel kernel; + cl_int status = 0; + CV_Assert(ProgramCache::getProgramCache() != NULL); + cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, kernelName, build_options); + CV_Assert(program != NULL); + kernel = clCreateKernel(program, kernelName.c_str(), &status); + openCLVerifyCall(status); + return kernel; +} + +void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads) +{ + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] ); + CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] ); + CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] ); + CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize ); + CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize ); +} + +#ifdef PRINT_KERNEL_RUN_TIME +static double total_execute_time = 0; +static double total_kernel_time = 0; +#endif +void openCLExecuteKernel_(Context *ctx , const char **source, string kernelName, size_t globalThreads[3], + size_t localThreads[3], vector< pair > &args, int channels, + int depth, const char *build_options) +{ + //construct kernel name + //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number + //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char) + stringstream idxStr; + if(channels != -1) + idxStr << "_C" << channels; + if(depth != -1) + idxStr << "_D" << depth; + kernelName += idxStr.str(); + + cl_kernel kernel; + kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options); + + if ( localThreads != NULL) + { + globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); + globalThreads[1] = roundUp(globalThreads[1], localThreads[1]); + globalThreads[2] = roundUp(globalThreads[2], localThreads[2]); + + cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads); + } + for(size_t i = 0; i < args.size(); i ++) + openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); + +#ifndef PRINT_KERNEL_RUN_TIME + openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads, + localThreads, 0, NULL, NULL)); +#else + cl_event event = NULL; + openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads, + localThreads, 0, NULL, &event)); + + cl_ulong start_time, end_time, queue_time; + double execute_time = 0; + double total_time = 0; + + openCLSafeCall(clWaitForEvents(1, &event)); + openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &start_time, 0)); + + openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &end_time, 0)); + + openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &queue_time, 0)); + + execute_time = (double)(end_time - start_time) / (1000 * 1000); + total_time = (double)(end_time - queue_time) / (1000 * 1000); + + total_execute_time += execute_time; + total_kernel_time += total_time; + clReleaseEvent(event); +#endif + + clFlush(getClCommandQueue(ctx)); + openCLSafeCall(clReleaseKernel(kernel)); +} + +void openCLExecuteKernel(Context *ctx , const char **source, string kernelName, + size_t globalThreads[3], size_t localThreads[3], + vector< pair > &args, int channels, int depth) +{ + openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args, + channels, depth, NULL); +} +void openCLExecuteKernel(Context *ctx , const char **source, string kernelName, + size_t globalThreads[3], size_t localThreads[3], + vector< pair > &args, int channels, int depth, const char *build_options) + +{ +#ifndef PRINT_KERNEL_RUN_TIME + openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth, + build_options); +#else + string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"}; + cout << endl; + cout << "Function Name: " << kernelName; + if(depth >= 0) + cout << " |data type: " << data_type[depth]; + cout << " |channels: " << channels; + cout << " |Time Unit: " << "ms" << endl; + + total_execute_time = 0; + total_kernel_time = 0; + cout << "-------------------------------------" << endl; + + cout << setiosflags(ios::left) << setw(15) << "excute time"; + cout << setiosflags(ios::left) << setw(15) << "lauch time"; + cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl; + int i = 0; + for(i = 0; i < RUN_TIMES; i++) + openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth, + build_options); + + cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl; + cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl; +#endif +} + +double openCLExecuteKernelInterop(Context *ctx , const char **source, string kernelName, + size_t globalThreads[3], size_t localThreads[3], + vector< pair > &args, int channels, int depth, const char *build_options, + bool finish, bool measureKernelTime, bool cleanUp) + +{ + //construct kernel name + //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number + //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char) + stringstream idxStr; + if(channels != -1) + idxStr << "_C" << channels; + if(depth != -1) + idxStr << "_D" << depth; + kernelName += idxStr.str(); + + cl_kernel kernel; + kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options); + + double kernelTime = 0.0; + + if( globalThreads != NULL) + { + if ( localThreads != NULL) + { + globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0]; + globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; + globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; + + //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; + cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads); + } + for(size_t i = 0; i < args.size(); i ++) + openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); + + if(measureKernelTime == false) + { + openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads, + localThreads, 0, NULL, NULL)); + } + else + { + cl_event event = NULL; + openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads, + localThreads, 0, NULL, &event)); + + cl_ulong end_time, queue_time; + + openCLSafeCall(clWaitForEvents(1, &event)); + + openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &end_time, 0)); + + openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &queue_time, 0)); + + kernelTime = (double)(end_time - queue_time) / (1000 * 1000); + + clReleaseEvent(event); + } + } + + if(finish) + { + clFinish(getClCommandQueue(ctx)); + } + + if(cleanUp) + { + openCLSafeCall(clReleaseKernel(kernel)); + } + + return kernelTime; +} + +//double openCLExecuteKernelInterop(Context *ctx , const char **fileName, const int numFiles, string kernelName, +// size_t globalThreads[3], size_t localThreads[3], +// vector< pair > &args, int channels, int depth, const char *build_options, +// bool finish, bool measureKernelTime, bool cleanUp) +// +//{ +// std::vector fsource; +// for (int i = 0 ; i < numFiles ; i++) +// { +// std::string str; +// if (convertToString(fileName[i], str) >= 0) +// fsource.push_back(str); +// } +// const char **source = new const char *[numFiles]; +// for (int i = 0 ; i < numFiles ; i++) +// source[i] = fsource[i].c_str(); +// double kernelTime = openCLExecuteKernelInterop(ctx ,source, kernelName, globalThreads, localThreads, +// args, channels, depth, build_options, finish, measureKernelTime, cleanUp); +// fsource.clear(); +// delete []source; +// return kernelTime; +//} + +cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value, + const size_t size) +{ + int status; + cl_mem con_struct; + + con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status); + openCLSafeCall(status); + + openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size, + value, 0, 0, 0)); + + return con_struct; + +} + +}//namespace ocl +}//namespace cv diff --git a/modules/ocl/src/cl_programcache.cpp b/modules/ocl/src/cl_programcache.cpp new file mode 100644 index 0000000000..3261319c34 --- /dev/null +++ b/modules/ocl/src/cl_programcache.cpp @@ -0,0 +1,311 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Guoping Long, longguoping@gmail.com +// Niko Li, newlife20080214@gmail.com +// Yao Wang, bitwangyaoyao@gmail.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include +#include +#include "binarycaching.hpp" + +#undef __CL_ENABLE_EXCEPTIONS +#include + +namespace cv { namespace ocl { +/* + * The binary caching system to eliminate redundant program source compilation. + * Strictly, this is not a cache because we do not implement evictions right now. + * We shall add such features to trade-off memory consumption and performance when necessary. + */ + +std::auto_ptr _programCache; +ProgramCache* ProgramCache::getProgramCache() +{ + if (NULL == _programCache.get()) + _programCache.reset(new ProgramCache()); + return _programCache.get(); +} + +ProgramCache::ProgramCache() +{ + codeCache.clear(); + cacheSize = 0; +} + +ProgramCache::~ProgramCache() +{ + releaseProgram(); +} + +cl_program ProgramCache::progLookup(string srcsign) +{ + map::iterator iter; + iter = codeCache.find(srcsign); + if(iter != codeCache.end()) + return iter->second; + else + return NULL; +} + +void ProgramCache::addProgram(string srcsign , cl_program program) +{ + if(!progLookup(srcsign)) + { + codeCache.insert(map::value_type(srcsign, program)); + } +} + +void ProgramCache::releaseProgram() +{ + map::iterator iter; + for(iter = codeCache.begin(); iter != codeCache.end(); iter++) + { + openCLSafeCall(clReleaseProgram(iter->second)); + } + codeCache.clear(); + cacheSize = 0; +} + +static int enable_disk_cache = +#ifdef _DEBUG + false; +#else + true; +#endif +static int update_disk_cache = false; +static String binpath = ""; + +void setBinaryDiskCache(int mode, String path) +{ + if(mode == CACHE_NONE) + { + update_disk_cache = 0; + enable_disk_cache = 0; + return; + } + update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE; + enable_disk_cache |= +#ifdef _DEBUG + (mode & CACHE_DEBUG) == CACHE_DEBUG; +#else + (mode & CACHE_RELEASE) == CACHE_RELEASE; +#endif + if(enable_disk_cache && !path.empty()) + { + binpath = path; + } +} + +void setBinpath(const char *path) +{ + binpath = path; +} + +int savetofile(const Context*, cl_program &program, const char *fileName) +{ + size_t binarySize; + openCLSafeCall(clGetProgramInfo(program, + CL_PROGRAM_BINARY_SIZES, + sizeof(size_t), + &binarySize, NULL)); + char* binary = (char*)malloc(binarySize); + if(binary == NULL) + { + CV_Error(CV_StsNoMem, "Failed to allocate host memory."); + } + openCLSafeCall(clGetProgramInfo(program, + CL_PROGRAM_BINARIES, + sizeof(char *), + &binary, + NULL)); + + FILE *fp = fopen(fileName, "wb+"); + if(fp != NULL) + { + fwrite(binary, binarySize, 1, fp); + free(binary); + fclose(fp); + } + return 1; +} + +cl_program ProgramCache::getProgram(const Context *ctx, const char **source, string kernelName, + const char *build_options) +{ + cl_program program; + cl_int status = 0; + stringstream src_sign; + string srcsign; + string filename; + + if (NULL != build_options) + { + src_sign << (int64)(*source) << getClContext(ctx) << "_" << build_options; + } + else + { + src_sign << (int64)(*source) << getClContext(ctx); + } + srcsign = src_sign.str(); + + program = NULL; + program = ProgramCache::getProgramCache()->progLookup(srcsign); + + if (!program) + { + //config build programs + std::string all_build_options; + if (!ctx->getDeviceInfo().compilationExtraOptions.empty()) + all_build_options += ctx->getDeviceInfo().compilationExtraOptions; + if (build_options != NULL) + { + all_build_options += " "; + all_build_options += build_options; + } + filename = binpath + kernelName + "_" + ctx->getDeviceInfo().deviceName + all_build_options + ".clb"; + + FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL; + if(fp == NULL || update_disk_cache) + { + if(fp != NULL) + fclose(fp); + + program = clCreateProgramWithSource( + getClContext(ctx), 1, source, NULL, &status); + openCLVerifyCall(status); + cl_device_id device = getClDeviceID(ctx); + status = clBuildProgram(program, 1, &device, all_build_options.c_str(), NULL, NULL); + if(status == CL_SUCCESS && enable_disk_cache) + savetofile(ctx, program, filename.c_str()); + } + else + { + fseek(fp, 0, SEEK_END); + size_t binarySize = ftell(fp); + fseek(fp, 0, SEEK_SET); + char *binary = new char[binarySize]; + CV_Assert(1 == fread(binary, binarySize, 1, fp)); + fclose(fp); + cl_int status = 0; + cl_device_id device = getClDeviceID(ctx); + program = clCreateProgramWithBinary(getClContext(ctx), + 1, + &device, + (const size_t *)&binarySize, + (const unsigned char **)&binary, + NULL, + &status); + openCLVerifyCall(status); + status = clBuildProgram(program, 1, &device, all_build_options.c_str(), NULL, NULL); + delete[] binary; + } + + if(status != CL_SUCCESS) + { + if(status == CL_BUILD_PROGRAM_FAILURE) + { + cl_int logStatus; + char *buildLog = NULL; + size_t buildLogSize = 0; + logStatus = clGetProgramBuildInfo(program, + getClDeviceID(ctx), CL_PROGRAM_BUILD_LOG, buildLogSize, + buildLog, &buildLogSize); + if(logStatus != CL_SUCCESS) + std::cout << "Failed to build the program and get the build info." << endl; + buildLog = new char[buildLogSize]; + CV_DbgAssert(!!buildLog); + memset(buildLog, 0, buildLogSize); + openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx), + CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL)); + std::cout << "\n\t\t\tBUILD LOG\n"; + std::cout << buildLog << endl; + delete [] buildLog; + } + openCLVerifyCall(status); + } + //Cache the binary for future use if build_options is null + if( (this->cacheSize += 1) < MAX_PROG_CACHE_SIZE) + this->addProgram(srcsign, program); + else + cout << "Warning: code cache has been full.\n"; + } + return program; +} + +//// Converts the contents of a file into a string +//static int convertToString(const char *filename, std::string& s) +//{ +// size_t size; +// char* str; +// +// std::fstream f(filename, (std::fstream::in | std::fstream::binary)); +// if(f.is_open()) +// { +// size_t fileSize; +// f.seekg(0, std::fstream::end); +// size = fileSize = (size_t)f.tellg(); +// f.seekg(0, std::fstream::beg); +// +// str = new char[size+1]; +// if(!str) +// { +// f.close(); +// return -1; +// } +// +// f.read(str, fileSize); +// f.close(); +// str[size] = '\0'; +// +// s = str; +// delete[] str; +// return 0; +// } +// printf("Error: Failed to open file %s\n", filename); +// return -1; +//} + +} // namespace ocl +} // namespace cv diff --git a/modules/ocl/src/error.cpp b/modules/ocl/src/error.cpp index e854e70cd0..cd6d3d5346 100644 --- a/modules/ocl/src/error.cpp +++ b/modules/ocl/src/error.cpp @@ -152,19 +152,19 @@ namespace cv case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; //case CL_INVALID_PROPERTY: - // return "CL_INVALID_PROPERTY"; + // return "CL_INVALID_PROPERTY"; //case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR: - // return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + // return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; //case CL_PLATFORM_NOT_FOUND_KHR: - // return "CL_PLATFORM_NOT_FOUND_KHR"; - // //case CL_INVALID_PROPERTY_EXT: - // // return "CL_INVALID_PROPERTY_EXT"; + // return "CL_PLATFORM_NOT_FOUND_KHR"; + // //case CL_INVALID_PROPERTY_EXT: + // // return "CL_INVALID_PROPERTY_EXT"; //case CL_DEVICE_PARTITION_FAILED_EXT: - // return "CL_DEVICE_PARTITION_FAILED_EXT"; + // return "CL_DEVICE_PARTITION_FAILED_EXT"; //case CL_INVALID_PARTITION_COUNT_EXT: - // return "CL_INVALID_PARTITION_COUNT_EXT"; + // return "CL_INVALID_PARTITION_COUNT_EXT"; //default: - // return "unknown error code"; + // return "unknown error code"; } static char buf[256]; sprintf(buf, "%d", err); diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp index b6cc070fb5..c0785ac9d8 100644 --- a/modules/ocl/src/fft.cpp +++ b/modules/ocl/src/fft.cpp @@ -156,25 +156,25 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla { fft_setup(); - bool is_1d_input = (_dft_size.height == 1); - int is_row_dft = flags & DFT_ROWS; + bool is_1d_input = (_dft_size.height == 1); + int is_row_dft = flags & DFT_ROWS; int is_scaled_dft = flags & DFT_SCALE; - int is_inverse = flags & DFT_INVERSE; + int is_inverse = flags & DFT_INVERSE; - //clAmdFftResultLocation place; - clAmdFftLayout inLayout; - clAmdFftLayout outLayout; - clAmdFftDim dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D; + //clAmdFftResultLocation place; + clAmdFftLayout inLayout; + clAmdFftLayout outLayout; + clAmdFftDim dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D; - size_t batchSize = is_row_dft ? dft_size.height : 1; + size_t batchSize = is_row_dft ? dft_size.height : 1; size_t clLengthsIn[ 3 ] = {1, 1, 1}; size_t clStridesIn[ 3 ] = {1, 1, 1}; //size_t clLengthsOut[ 3 ] = {1, 1, 1}; size_t clStridesOut[ 3 ] = {1, 1, 1}; - clLengthsIn[0] = dft_size.width; - clLengthsIn[1] = is_row_dft ? 1 : dft_size.height; - clStridesIn[0] = 1; - clStridesOut[0] = 1; + clLengthsIn[0] = dft_size.width; + clLengthsIn[1] = is_row_dft ? 1 : dft_size.height; + clStridesIn[0] = 1; + clStridesOut[0] = 1; switch(_type) { @@ -206,7 +206,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1]; clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1]; - openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) ); + openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getClContextPtr(), dim, clLengthsIn ) ); openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) ); openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) ); @@ -220,7 +220,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla openCLSafeCall( clAmdFftSetPlanScale ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) ); //ready to bake - openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) ); + openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getClCommandQueuePtr(), NULL, NULL ) ); } cv::ocl::FftPlan::~FftPlan() { @@ -296,12 +296,12 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags) // similar assertions with cuda module CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); - //bool is_1d_input = (src.rows == 1); - //int is_row_dft = flags & DFT_ROWS; - //int is_scaled_dft = flags & DFT_SCALE; - int is_inverse = flags & DFT_INVERSE; - bool is_complex_input = src.channels() == 2; - bool is_complex_output = !(flags & DFT_REAL_OUTPUT); + //bool is_1d_input = (src.rows == 1); + //int is_row_dft = flags & DFT_ROWS; + //int is_scaled_dft = flags & DFT_SCALE; + int is_inverse = flags & DFT_INVERSE; + bool is_complex_input = src.channels() == 2; + bool is_complex_output = !(flags & DFT_REAL_OUTPUT); // We don't support real-to-real transform @@ -338,10 +338,10 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags) if (buffersize) { cl_int medstatus; - clMedBuffer = clCreateBuffer ( (cl_context)src.clCxt->oclContext(), CL_MEM_READ_WRITE, buffersize, 0, &medstatus); + clMedBuffer = clCreateBuffer ( *(cl_context*)(src.clCxt->getOpenCLContextPtr()), CL_MEM_READ_WRITE, buffersize, 0, &medstatus); openCLSafeCall( medstatus ); } - cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue(); + cl_command_queue clq = *(cl_command_queue*)(src.clCxt->getOpenCLCommandQueuePtr()); openCLSafeCall( clAmdFftEnqueueTransform( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, 1, diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 284dc61632..caaf53d849 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -1430,7 +1430,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) { - if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F) + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); return; diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp index ec03c2f932..687f26f632 100644 --- a/modules/ocl/src/gemm.cpp +++ b/modules/ocl/src/gemm.cpp @@ -134,7 +134,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha, int offb = src2.offset; int offc = dst.offset; - cl_command_queue clq = (cl_command_queue)src1.clCxt->oclCommandQueue(); + cl_command_queue clq = *(cl_command_queue*)src1.clCxt->getOpenCLCommandQueuePtr(); switch(src1.type()) { case CV_32FC1: diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index 37ebaafa38..29a96ae658 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -338,7 +338,7 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &poin CV_DbgAssert(points.type() == CV_32FC2); points_v.resize(points.cols); openCLSafeCall(clEnqueueReadBuffer( - *reinterpret_cast(getoclCommandQueue()), + *(cl_command_queue*)getClCommandQueuePtr(), reinterpret_cast(points.data), CL_TRUE, 0, diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 212fd2c444..e3e73b3c3d 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -745,7 +745,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( gimg.cols < minSize.width || gimg.rows < minSize.height ) CV_Error(CV_StsError, "Image too small"); - cl_command_queue qu = reinterpret_cast(Context::getContext()->oclCommandQueue()); + cl_command_queue qu = getClCommandQueue(Context::getContext()); if( (flags & CV_HAAR_SCALE_IMAGE) ) { CvSize winSize0 = cascade->orig_window_size; @@ -788,7 +788,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; - size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0], + size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->getDeviceInfo().maxComputeUnits) *localThreads[0], localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; @@ -949,7 +949,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int grp_per_CU = 12; size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; - size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0], + size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->getDeviceInfo().maxComputeUnits *localThreads[0], localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - @@ -1120,7 +1120,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std int blocksize = 8; int grp_per_CU = 12; size_t localThreads[3] = { blocksize, blocksize, 1 }; - size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0], + size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->getDeviceInfo().maxComputeUnits *localThreads[0], localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; @@ -1148,7 +1148,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std } int *candidate; - cl_command_queue qu = reinterpret_cast(Context::getContext()->oclCommandQueue()); + cl_command_queue qu = getClCommandQueue(Context::getContext()); if( (flags & CV_HAAR_SCALE_IMAGE) ) { int indexy = 0; @@ -1340,7 +1340,7 @@ void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols, GpuHidHaarStageClassifier *stage; GpuHidHaarClassifier *classifier; GpuHidHaarTreeNode *node; - cl_command_queue qu = reinterpret_cast(Context::getContext()->oclCommandQueue()); + cl_command_queue qu = getClCommandQueue(Context::getContext()); if( (flags & CV_HAAR_SCALE_IMAGE) ) { gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade); @@ -1505,7 +1505,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs( CvSize sz; CvSize winSize0 = oldCascade->orig_window_size; detect_piramid_info *scaleinfo; - cl_command_queue qu = reinterpret_cast(Context::getContext()->oclCommandQueue()); + cl_command_queue qu = getClCommandQueue(Context::getContext()); if (flags & CV_HAAR_SCALE_IMAGE) { for(factor = 1.f;; factor *= scaleFactor) diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 55872829a9..563172bc13 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -157,7 +157,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo effect_size = Size(0, 0); - if (queryDeviceInfo()) + if (isCpuDevice()) hog_device_cpu = true; else hog_device_cpu = false; @@ -1670,9 +1670,9 @@ void cv::ocl::device::hog::compute_hists(int nbins, else { cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); - int wave_size = queryDeviceInfo(kernel); + size_t wave_size = queryWaveFrontSize(kernel); char opt[32] = {0}; - sprintf(opt, "-D WAVE_SIZE=%d", wave_size); + sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size); openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, opt); } @@ -1734,9 +1734,9 @@ void cv::ocl::device::hog::normalize_hists(int nbins, else { cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); - int wave_size = queryDeviceInfo(kernel); + size_t wave_size = queryWaveFrontSize(kernel); char opt[32] = {0}; - sprintf(opt, "-D WAVE_SIZE=%d", wave_size); + sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size); openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, opt); } @@ -1803,9 +1803,9 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, else { cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); - int wave_size = queryDeviceInfo(kernel); + size_t wave_size = queryWaveFrontSize(kernel); char opt[32] = {0}; - sprintf(opt, "-D WAVE_SIZE=%d", wave_size); + sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size); openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, opt); } diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 7d0d941dfa..0949605e15 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -289,7 +289,7 @@ namespace cv args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) + if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) { args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue)); } @@ -317,7 +317,7 @@ namespace cv args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) + if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) { args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue)); } @@ -380,7 +380,7 @@ namespace cv args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) + if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) { args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); @@ -802,12 +802,12 @@ namespace cv string kernelName = "warpAffine" + s[interpolation]; - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) + if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) { cl_int st; - coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st ); + coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st ); openCLVerifyCall(st); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0)); + openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0)); } else { @@ -817,8 +817,8 @@ namespace cv { float_coeffs[m][n] = coeffs[m][n]; } - coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st ); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0)); + coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st ); + openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0)); } //TODO: improve this kernel @@ -872,12 +872,12 @@ namespace cv string s[3] = {"NN", "Linear", "Cubic"}; string kernelName = "warpPerspective" + s[interpolation]; - if(src.clCxt->supportsFeature(Context::CL_DOUBLE)) + if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) { cl_int st; - coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st ); + coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st ); openCLVerifyCall(st); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0)); + openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0)); } else { @@ -886,9 +886,9 @@ namespace cv for(int n = 0; n < 3; n++) float_coeffs[m][n] = coeffs[m][n]; - coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st ); + coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st ); openCLVerifyCall(st); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0)); + openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0)); } //TODO: improve this kernel size_t blkSizeX = 16, blkSizeY = 16; @@ -994,7 +994,7 @@ namespace cv void integral(const oclMat &src, oclMat &sum, oclMat &sqsum) { CV_Assert(src.type() == CV_8UC1); - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "select device don't support double"); } @@ -1192,7 +1192,7 @@ namespace cv void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, double k, int borderType) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "select device don't support double"); } @@ -1211,7 +1211,7 @@ namespace cv void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType) { - if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) + if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "select device don't support double"); } @@ -1512,17 +1512,17 @@ namespace cv String kernelName = "calcLut"; size_t localThreads[3] = { 32, 8, 1 }; size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 }; - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (is_cpu) openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU"); else { cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName); - int wave_size = queryDeviceInfo(kernel); + size_t wave_size = queryWaveFrontSize(kernel); openCLSafeCall(clReleaseKernel(kernel)); static char opt[20] = {0}; - sprintf(opt, " -D WAVE_SIZE=%d", wave_size); + sprintf(opt, " -D WAVE_SIZE=%d", (int)wave_size); openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt); } } diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp deleted file mode 100644 index c18984b078..0000000000 --- a/modules/ocl/src/initialization.cpp +++ /dev/null @@ -1,1090 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Guoping Long, longguoping@gmail.com -// Niko Li, newlife20080214@gmail.com -// Yao Wang, bitwangyaoyao@gmail.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include -#include -#include "binarycaching.hpp" - -using namespace cv; -using namespace cv::ocl; -using namespace std; -using std::cout; -using std::endl; - -//#define PRINT_KERNEL_RUN_TIME -#define RUN_TIMES 100 -#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD -#define CL_MEM_USE_PERSISTENT_MEM_AMD 0 -#endif -//#define AMD_DOUBLE_DIFFER - -namespace cv -{ - namespace ocl - { - extern void fft_teardown(); - extern void clBlasTeardown(); - /* - * The binary caching system to eliminate redundant program source compilation. - * Strictly, this is not a cache because we do not implement evictions right now. - * We shall add such features to trade-off memory consumption and performance when necessary. - */ - auto_ptr ProgramCache::programCache; - ProgramCache *programCache = NULL; - DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT; - DevMemRW gDeviceMemRW = DEVICE_MEM_R_W; - int gDevMemTypeValueMap[5] = {0, - CL_MEM_ALLOC_HOST_PTR, - CL_MEM_USE_HOST_PTR, - CL_MEM_COPY_HOST_PTR, - CL_MEM_USE_PERSISTENT_MEM_AMD}; - int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY}; - - ProgramCache::ProgramCache() - { - codeCache.clear(); - cacheSize = 0; - } - - ProgramCache::~ProgramCache() - { - releaseProgram(); - } - - cl_program ProgramCache::progLookup(string srcsign) - { - map::iterator iter; - iter = codeCache.find(srcsign); - if(iter != codeCache.end()) - return iter->second; - else - return NULL; - } - - void ProgramCache::addProgram(string srcsign , cl_program program) - { - if(!progLookup(srcsign)) - { - codeCache.insert(map::value_type(srcsign, program)); - } - } - - void ProgramCache::releaseProgram() - { - map::iterator iter; - for(iter = codeCache.begin(); iter != codeCache.end(); iter++) - { - openCLSafeCall(clReleaseProgram(iter->second)); - } - codeCache.clear(); - cacheSize = 0; - } - struct Info::Impl - { - cl_platform_id oclplatform; - std::vector devices; - std::vector devName; - std::string clVersion; - - cl_context oclcontext; - cl_command_queue clCmdQueue; - int devnum; - size_t maxWorkGroupSize; - cl_uint maxDimensions; // == maxWorkItemSizes.size() - std::vector maxWorkItemSizes; - cl_uint maxComputeUnits; - char extra_options[512]; - int double_support; - int unified_memory; //1 means integrated GPU, otherwise this value is 0 - int refcounter; - - Impl(); - - void setDevice(void *ctx, void *q, int devnum); - - void release() - { - if(1 == CV_XADD(&refcounter, -1)) - { - releaseResources(); - delete this; - } - } - - Impl* copy() - { - CV_XADD(&refcounter, 1); - return this; - } - - private: - Impl(const Impl&); - Impl& operator=(const Impl&); - void releaseResources(); - }; - - // global variables to hold binary cache properties - static int enable_disk_cache = -#ifdef _DEBUG - false; -#else - true; -#endif - static int update_disk_cache = false; - static String binpath = ""; - - Info::Impl::Impl() - :oclplatform(0), - oclcontext(0), - clCmdQueue(0), - devnum(-1), - maxWorkGroupSize(0), - maxDimensions(0), - maxComputeUnits(0), - double_support(0), - unified_memory(0), - refcounter(1) - { - memset(extra_options, 0, 512); - } - - void Info::Impl::releaseResources() - { - devnum = -1; - - if(clCmdQueue) - { - //temporarily disable command queue release as it causes program hang at exit - //openCLSafeCall(clReleaseCommandQueue(clCmdQueue)); - clCmdQueue = 0; - } - - if(oclcontext) - { - openCLSafeCall(clReleaseContext(oclcontext)); - oclcontext = 0; - } - } - - void Info::Impl::setDevice(void *ctx, void *q, int dnum) - { - if((ctx && q) || devnum != dnum) - releaseResources(); - - CV_Assert(dnum >= 0 && dnum < (int)devices.size()); - devnum = dnum; - if(ctx && q) - { - oclcontext = (cl_context)ctx; - clCmdQueue = (cl_command_queue)q; - clRetainContext(oclcontext); - clRetainCommandQueue(clCmdQueue); - } - else - { - cl_int status = 0; - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 }; - oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status); - openCLVerifyCall(status); - clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status); - openCLVerifyCall(status); - } - - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0)); - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0)); - maxWorkItemSizes.resize(maxDimensions); - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0)); - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0)); - - cl_bool unfymem = false; - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0)); - unified_memory = unfymem ? 1 : 0; - - //initialize extra options for compilation. Currently only fp64 is included. - //Assume 4KB is enough to store all possible extensions. - const int EXT_LEN = 4096 + 1 ; - char extends_set[EXT_LEN]; - size_t extends_size; - openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size)); - extends_set[EXT_LEN - 1] = 0; - size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64"); - - if(fp64_khr != std::string::npos) - { - sprintf(extra_options, "-D DOUBLE_SUPPORT"); - double_support = 1; - } - else - { - memset(extra_options, 0, 512); - double_support = 0; - } - } - - ////////////////////////Common OpenCL specific calls/////////////// - int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type) - { - rw_type = gDeviceMemRW; - mem_type = gDeviceMemType; - return Context::getContext()->impl->unified_memory; - } - - int setDevMemType(DevMemRW rw_type, DevMemType mem_type) - { - if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) || - mem_type == DEVICE_MEM_UHP || - mem_type == DEVICE_MEM_CHP ) - return -1; - gDeviceMemRW = rw_type; - gDeviceMemType = mem_type; - return 0; - } - - int getDevice(std::vector &oclinfo, int devicetype) - { - //TODO: cache oclinfo vector - oclinfo.clear(); - - switch(devicetype) - { - case CVCL_DEVICE_TYPE_DEFAULT: - case CVCL_DEVICE_TYPE_CPU: - case CVCL_DEVICE_TYPE_GPU: - case CVCL_DEVICE_TYPE_ACCELERATOR: - case CVCL_DEVICE_TYPE_ALL: - break; - default: - return 0; - } - - // Platform info - cl_uint numPlatforms; - openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms)); - if(numPlatforms < 1) return 0; - - std::vector platforms(numPlatforms); - openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0)); - - char deviceName[256]; - int devcienums = 0; - char clVersion[256]; - for (unsigned i = 0; i < numPlatforms; ++i) - { - cl_uint numsdev = 0; - cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev); - if(status != CL_DEVICE_NOT_FOUND) - openCLVerifyCall(status); - - if(numsdev > 0) - { - devcienums += numsdev; - std::vector devices(numsdev); - openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0)); - - Info ocltmpinfo; - ocltmpinfo.impl->oclplatform = platforms[i]; - openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL)); - ocltmpinfo.impl->clVersion = clVersion; - for(unsigned j = 0; j < numsdev; ++j) - { - ocltmpinfo.impl->devices.push_back(devices[j]); - openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0)); - ocltmpinfo.impl->devName.push_back(deviceName); - ocltmpinfo.DeviceName.push_back(deviceName); - } - oclinfo.push_back(ocltmpinfo); - } - } - if(devcienums > 0) - { - setDevice(oclinfo[0]); - } - return devcienums; - } - - void setDevice(Info &oclinfo, int devnum) - { - oclinfo.impl->setDevice(0, 0, devnum); - Context::setContext(oclinfo); - } - - void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum) - { - oclinfo.impl->setDevice(ctx, q, devnum); - Context::setContext(oclinfo); - } - - void *getoclContext() - { - return &(Context::getContext()->impl->oclcontext); - } - - void *getoclCommandQueue() - { - return &(Context::getContext()->impl->clCmdQueue); - } - - void finish() - { - clFinish(Context::getContext()->impl->clCmdQueue); - } - - //template specializations of queryDeviceInfo - template<> - bool queryDeviceInfo(cl_kernel) - { - Info::Impl* impl = Context::getContext()->impl; - cl_device_type devicetype; - openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], - CL_DEVICE_TYPE, sizeof(cl_device_type), - &devicetype, NULL)); - return (devicetype == CVCL_DEVICE_TYPE_CPU); - } - - template - static _ty queryWavesize(cl_kernel kernel) - { - size_t info = 0; - Info::Impl* impl = Context::getContext()->impl; - bool is_cpu = queryDeviceInfo(); - if(is_cpu) - { - return 1; - } - CV_Assert(kernel != NULL); - openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum], - CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL)); - return static_cast<_ty>(info); - } - - template<> - size_t queryDeviceInfo(cl_kernel kernel) - { - return queryWavesize(kernel); - } - template<> - int queryDeviceInfo(cl_kernel kernel) - { - return queryWavesize(kernel); - } - - void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size) - { - cl_int status; - status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0, - size, host_buffer, 0, NULL, NULL); - openCLVerifyCall(status); - } - - cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size) - { - cl_int status; - cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status); - openCLVerifyCall(status); - return buffer; - } - - void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch, - size_t widthInBytes, size_t height) - { - openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType); - } - - void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch, - size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type) - { - cl_int status; - *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], - widthInBytes * height, 0, &status); - openCLVerifyCall(status); - *pitch = widthInBytes; - } - - void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, - const void *src, size_t spitch, - size_t width, size_t height, openCLMemcpyKind kind, int channels) - { - size_t buffer_origin[3] = {0, 0, 0}; - size_t host_origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - if(kind == clMemcpyHostToDevice) - { - if(dpitch == width || channels == 3 || height == 1) - { - openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, - 0, width * height, src, 0, NULL, NULL)); - } - else - { - openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, - buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); - } - } - else if(kind == clMemcpyDeviceToHost) - { - if(spitch == width || channels == 3 || height == 1) - { - openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, - 0, width * height, dst, 0, NULL, NULL)); - } - else - { - openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, - buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); - } - } - } - - void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, - const void *src, size_t spitch, - size_t width, size_t height, int src_offset) - { - size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0}; - size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0}; - size_t region[3] = {width, height, 1}; - - openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin, - region, spitch, 0, dpitch, 0, 0, 0, 0)); - } - - void openCLFree(void *devPtr) - { - openCLSafeCall(clReleaseMemObject((cl_mem)devPtr)); - } - cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName) - { - return openCLGetKernelFromSource(clCxt, source, kernelName, NULL); - } - - void setBinaryDiskCache(int mode, String path) - { - if(mode == CACHE_NONE) - { - update_disk_cache = 0; - enable_disk_cache = 0; - return; - } - update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE; - enable_disk_cache |= -#ifdef _DEBUG - (mode & CACHE_DEBUG) == CACHE_DEBUG; -#else - (mode & CACHE_RELEASE) == CACHE_RELEASE; -#endif - if(enable_disk_cache && !path.empty()) - { - binpath = path; - } - } - - void setBinpath(const char *path) - { - binpath = path; - } - - int savetofile(const Context*, cl_program &program, const char *fileName) - { - size_t binarySize; - openCLSafeCall(clGetProgramInfo(program, - CL_PROGRAM_BINARY_SIZES, - sizeof(size_t), - &binarySize, NULL)); - char* binary = (char*)malloc(binarySize); - if(binary == NULL) - { - CV_Error(CV_StsNoMem, "Failed to allocate host memory."); - } - openCLSafeCall(clGetProgramInfo(program, - CL_PROGRAM_BINARIES, - sizeof(char *), - &binary, - NULL)); - - FILE *fp = fopen(fileName, "wb+"); - if(fp != NULL) - { - fwrite(binary, binarySize, 1, fp); - free(binary); - fclose(fp); - } - return 1; - } - - cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName, - const char *build_options) - { - cl_kernel kernel; - cl_program program ; - cl_int status = 0; - stringstream src_sign; - string srcsign; - string filename; - CV_Assert(programCache != NULL); - - if(NULL != build_options) - { - src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options; - } - else - { - src_sign << (int64)(*source) << clCxt->impl->oclcontext; - } - srcsign = src_sign.str(); - - program = NULL; - program = programCache->progLookup(srcsign); - - if(!program) - { - //config build programs - char all_build_options[1024]; - memset(all_build_options, 0, 1024); - char zeromem[512] = {0}; - if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512)) - strcat(all_build_options, clCxt -> impl->extra_options); - strcat(all_build_options, " "); - if(build_options != NULL) - strcat(all_build_options, build_options); - if(all_build_options != NULL) - { - filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb"; - } - else - { - filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb"; - } - - FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL; - if(fp == NULL || update_disk_cache) - { - if(fp != NULL) - fclose(fp); - - program = clCreateProgramWithSource( - clCxt->impl->oclcontext, 1, source, NULL, &status); - openCLVerifyCall(status); - status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL); - if(status == CL_SUCCESS && enable_disk_cache) - savetofile(clCxt, program, filename.c_str()); - } - else - { - fseek(fp, 0, SEEK_END); - size_t binarySize = ftell(fp); - fseek(fp, 0, SEEK_SET); - char *binary = new char[binarySize]; - CV_Assert(1 == fread(binary, binarySize, 1, fp)); - fclose(fp); - cl_int status = 0; - program = clCreateProgramWithBinary(clCxt->impl->oclcontext, - 1, - &(clCxt->impl->devices[clCxt->impl->devnum]), - (const size_t *)&binarySize, - (const unsigned char **)&binary, - NULL, - &status); - openCLVerifyCall(status); - status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL); - delete[] binary; - } - - if(status != CL_SUCCESS) - { - if(status == CL_BUILD_PROGRAM_FAILURE) - { - cl_int logStatus; - char *buildLog = NULL; - size_t buildLogSize = 0; - logStatus = clGetProgramBuildInfo(program, - clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize, - buildLog, &buildLogSize); - if(logStatus != CL_SUCCESS) - cout << "Failed to build the program and get the build info." << endl; - buildLog = new char[buildLogSize]; - CV_DbgAssert(!!buildLog); - memset(buildLog, 0, buildLogSize); - openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum], - CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL)); - cout << "\n\t\t\tBUILD LOG\n"; - cout << buildLog << endl; - delete [] buildLog; - } - openCLVerifyCall(status); - } - //Cache the binary for future use if build_options is null - if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE) - programCache->addProgram(srcsign, program); - else - cout << "Warning: code cache has been full.\n"; - } - kernel = clCreateKernel(program, kernelName.c_str(), &status); - openCLVerifyCall(status); - return kernel; - } - - void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads) - { - size_t kernelWorkGroupSize; - openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum], - CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); - CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] ); - CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] ); - CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] ); - CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize ); - CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize ); - } - - static inline size_t roundUp(size_t sz, size_t n) - { - // we don't assume that n is a power of 2 (see alignSize) - // equal to divUp(sz, n) * n - size_t t = sz + n - 1; - size_t rem = t % n; - size_t result = t - rem; - return result; - } - -#ifdef PRINT_KERNEL_RUN_TIME - static double total_execute_time = 0; - static double total_kernel_time = 0; -#endif - void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], - size_t localThreads[3], vector< pair > &args, int channels, - int depth, const char *build_options) - { - //construct kernel name - //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number - //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char) - stringstream idxStr; - if(channels != -1) - idxStr << "_C" << channels; - if(depth != -1) - idxStr << "_D" << depth; - kernelName += idxStr.str(); - - cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options); - - if ( localThreads != NULL) - { - globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); - globalThreads[1] = roundUp(globalThreads[1], localThreads[1]); - globalThreads[2] = roundUp(globalThreads[2], localThreads[2]); - - cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); - } - for(size_t i = 0; i < args.size(); i ++) - openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); - -#ifndef PRINT_KERNEL_RUN_TIME - openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, - localThreads, 0, NULL, NULL)); -#else - cl_event event = NULL; - openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, - localThreads, 0, NULL, &event)); - - cl_ulong start_time, end_time, queue_time; - double execute_time = 0; - double total_time = 0; - - openCLSafeCall(clWaitForEvents(1, &event)); - openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, - sizeof(cl_ulong), &start_time, 0)); - - openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &end_time, 0)); - - openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, - sizeof(cl_ulong), &queue_time, 0)); - - execute_time = (double)(end_time - start_time) / (1000 * 1000); - total_time = (double)(end_time - queue_time) / (1000 * 1000); - - total_execute_time += execute_time; - total_kernel_time += total_time; - clReleaseEvent(event); -#endif - - clFlush(clCxt->impl->clCmdQueue); - openCLSafeCall(clReleaseKernel(kernel)); - } - - void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, - size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth) - { - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, - channels, depth, NULL); - } - void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, - size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth, const char *build_options) - - { -#ifndef PRINT_KERNEL_RUN_TIME - openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, - build_options); -#else - string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"}; - cout << endl; - cout << "Function Name: " << kernelName; - if(depth >= 0) - cout << " |data type: " << data_type[depth]; - cout << " |channels: " << channels; - cout << " |Time Unit: " << "ms" << endl; - - total_execute_time = 0; - total_kernel_time = 0; - cout << "-------------------------------------" << endl; - - cout << setiosflags(ios::left) << setw(15) << "excute time"; - cout << setiosflags(ios::left) << setw(15) << "lauch time"; - cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl; - int i = 0; - for(i = 0; i < RUN_TIMES; i++) - openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, - build_options); - - cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl; - cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl; -#endif - } - - double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName, - size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth, const char *build_options, - bool finish, bool measureKernelTime, bool cleanUp) - - { - //construct kernel name - //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number - //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char) - stringstream idxStr; - if(channels != -1) - idxStr << "_C" << channels; - if(depth != -1) - idxStr << "_D" << depth; - kernelName += idxStr.str(); - - cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options); - - double kernelTime = 0.0; - - if( globalThreads != NULL) - { - if ( localThreads != NULL) - { - globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0]; - globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; - globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; - - //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; - cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); - } - for(size_t i = 0; i < args.size(); i ++) - openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); - - if(measureKernelTime == false) - { - openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, - localThreads, 0, NULL, NULL)); - } - else - { - cl_event event = NULL; - openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, - localThreads, 0, NULL, &event)); - - cl_ulong end_time, queue_time; - - openCLSafeCall(clWaitForEvents(1, &event)); - - openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &end_time, 0)); - - openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, - sizeof(cl_ulong), &queue_time, 0)); - - kernelTime = (double)(end_time - queue_time) / (1000 * 1000); - - clReleaseEvent(event); - } - } - - if(finish) - { - clFinish(clCxt->impl->clCmdQueue); - } - - if(cleanUp) - { - openCLSafeCall(clReleaseKernel(kernel)); - } - - return kernelTime; - } - - // Converts the contents of a file into a string - static int convertToString(const char *filename, std::string& s) - { - size_t size; - char* str; - - std::fstream f(filename, (std::fstream::in | std::fstream::binary)); - if(f.is_open()) - { - size_t fileSize; - f.seekg(0, std::fstream::end); - size = fileSize = (size_t)f.tellg(); - f.seekg(0, std::fstream::beg); - - str = new char[size+1]; - if(!str) - { - f.close(); - return -1; - } - - f.read(str, fileSize); - f.close(); - str[size] = '\0'; - - s = str; - delete[] str; - return 0; - } - printf("Error: Failed to open file %s\n", filename); - return -1; - } - - double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName, - size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth, const char *build_options, - bool finish, bool measureKernelTime, bool cleanUp) - - { - std::vector fsource; - for (int i = 0 ; i < numFiles ; i++) - { - std::string str; - if (convertToString(fileName[i], str) >= 0) - fsource.push_back(str); - } - const char **source = new const char *[numFiles]; - for (int i = 0 ; i < numFiles ; i++) - source[i] = fsource[i].c_str(); - double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads, - args, channels, depth, build_options, finish, measureKernelTime, cleanUp); - fsource.clear(); - delete []source; - return kernelTime; - } - - cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value, - const size_t size) - { - int status; - cl_mem con_struct; - - con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status); - openCLSafeCall(status); - - openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size, - value, 0, 0, 0)); - - return con_struct; - - } - - /////////////////////////////OpenCL initialization///////////////// - auto_ptr Context::clCxt; - int Context::val = 0; - static Mutex cs; - static volatile int context_tear_down = 0; - - bool initialized() - { - return *((volatile int*)&Context::val) != 0 && - Context::clCxt->impl->clCmdQueue != NULL&& - Context::clCxt->impl->oclcontext != NULL; - } - - Context* Context::getContext() - { - if(*((volatile int*)&val) != 1) - { - AutoLock al(cs); - if(*((volatile int*)&val) != 1) - { - if (context_tear_down) - return clCxt.get(); - if( 0 == clCxt.get()) - clCxt.reset(new Context); - std::vector oclinfo; - CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0); - - *((volatile int*)&val) = 1; - } - } - return clCxt.get(); - } - - void Context::setContext(Info &oclinfo) - { - AutoLock guard(cs); - if(*((volatile int*)&val) != 1) - { - if( 0 == clCxt.get()) - clCxt.reset(new Context); - - clCxt.get()->impl = oclinfo.impl->copy(); - - *((volatile int*)&val) = 1; - } - else - { - clCxt.get()->impl->release(); - clCxt.get()->impl = oclinfo.impl->copy(); - } - } - - Context::Context() - { - impl = 0; - programCache = ProgramCache::getProgramCache(); - } - - Context::~Context() - { - release(); - } - - void Context::release() - { - if (impl) - impl->release(); - programCache->releaseProgram(); - } - - bool Context::supportsFeature(int ftype) const - { - switch(ftype) - { - case CL_DOUBLE: - return impl->double_support == 1; - case CL_UNIFIED_MEM: - return impl->unified_memory == 1; - case CL_VER_1_2: - return impl->clVersion.find("OpenCL 1.2") != string::npos; - default: - return false; - } - } - - size_t Context::computeUnits() const - { - return impl->maxComputeUnits; - } - - unsigned long queryLocalMemInfo() - { - Info::Impl* impl = Context::getContext()->impl; - cl_ulong local_memory_size = 0; - clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void*)&local_memory_size, 0); - return local_memory_size; - } - - void* Context::oclContext() - { - return impl->oclcontext; - } - - void* Context::oclCommandQueue() - { - return impl->clCmdQueue; - } - - Info::Info() - { - impl = new Impl; - } - - void Info::release() - { - fft_teardown(); - clBlasTeardown(); - impl->release(); - impl = new Impl; - DeviceName.clear(); - } - - Info::~Info() - { - fft_teardown(); - clBlasTeardown(); - impl->release(); - } - - Info &Info::operator = (const Info &m) - { - impl->release(); - impl = m.impl->copy(); - DeviceName = m.DeviceName; - return *this; - } - - Info::Info(const Info &m) - { - impl = m.impl->copy(); - DeviceName = m.DeviceName; - } - }//namespace ocl - -}//namespace cv diff --git a/modules/ocl/src/knearest.cpp b/modules/ocl/src/knearest.cpp index fd9f2fed57..02dc72c4ea 100644 --- a/modules/ocl/src/knearest.cpp +++ b/modules/ocl/src/knearest.cpp @@ -44,17 +44,11 @@ //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" + using namespace cv; using namespace cv::ocl; -namespace cv -{ - namespace ocl - { - extern const char* knearest;//knearest - } -} - KNearestNeighbour::KNearestNeighbour() { clear(); @@ -112,7 +106,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable k1 = MIN( k1, k ); String kernel_name = "knn_find_nearest"; - cl_ulong local_memory_size = queryLocalMemInfo(); + cl_ulong local_memory_size = (cl_ulong)Context::getContext()->getDeviceInfo().localMemorySize; int nThreads = local_memory_size / (2 * k * 4); if(nThreads >= 256) nThreads = 256; @@ -122,7 +116,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable size_t global_thread[] = {1, samples.rows, 1}; char build_option[50]; - if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE)) + if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { sprintf(build_option, " "); }else diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 3ae14eb48d..d247a14794 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -134,7 +134,6 @@ void cv::ocl::oclMat::upload(const Mat &m) Size wholeSize; Point ofs; m.locateROI(wholeSize, ofs); - create(wholeSize, m.type()); if (m.channels() == 3) @@ -142,13 +141,12 @@ void cv::ocl::oclMat::upload(const Mat &m) int pitch = wholeSize.width * 3 * m.elemSize1(); int tail_padding = m.elemSize1() * 3072; int err; - cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, + cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err); openCLVerifyCall(err); openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); convert_C3C4(temp, *this); - openCLSafeCall(clReleaseMemObject(temp)); } else @@ -197,13 +195,12 @@ void cv::ocl::oclMat::download(cv::Mat &m) const int pitch = wholecols * 3 * m.elemSize1(); int tail_padding = m.elemSize1() * 3072; int err; - cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, + cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err); openCLVerifyCall(err); convert_C4C3(*this, temp); openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); - openCLSafeCall(clReleaseMemObject(temp)); } else @@ -319,7 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const { - if (!clCxt->supportsFeature(Context::CL_DOUBLE) && + if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && (depth() == CV_64F || dst.depth() == CV_64F)) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); @@ -380,7 +377,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri #ifdef CL_VERSION_1_2 // this enables backwards portability to // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support - if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) && + if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) && dst.offset == 0 && dst.cols == dst.wholecols) { const int sizeofMap[][7] = @@ -392,7 +389,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri }; int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()]; - clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), + clEnqueueFillBuffer(getClCommandQueue(dst.clCxt), (cl_mem)dst.data, (void*)mat.data, sizeofGeneric, 0, dst.step * dst.rows, 0, NULL, NULL); } diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp index fc94e2f3d8..e4e2e918fb 100644 --- a/modules/ocl/src/mcwutil.cpp +++ b/modules/ocl/src/mcwutil.cpp @@ -101,15 +101,15 @@ namespace cv for(size_t i = 0; i < args.size(); i ++) openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, globalThreads, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), kernel, 3, NULL, globalThreads, localThreads, 0, NULL, NULL)); switch(finish_mode) { case CLFINISH: - clFinish((cl_command_queue)clCxt->oclCommandQueue()); + clFinish(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr()); case CLFLUSH: - clFlush((cl_command_queue)clCxt->oclCommandQueue()); + clFlush(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr()); break; case DISABLE: default: @@ -178,7 +178,7 @@ namespace cv #ifdef CL_VERSION_1_2 //this enables backwards portability to //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support - if(Context::getContext()->supportsFeature(Context::CL_VER_1_2)) + if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2)) { cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; @@ -191,13 +191,13 @@ namespace cv desc.buffer = NULL; desc.num_mip_levels = 0; desc.num_samples = 0; - texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); + texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); } else #endif { texture = clCreateImage2D( - (cl_context)mat.clCxt->oclContext(), + *(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, mat.cols, @@ -212,22 +212,22 @@ namespace cv cl_mem devData; if (mat.cols * mat.elemSize() != mat.step) { - devData = clCreateBuffer((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_ONLY, mat.cols * mat.rows + devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_ONLY, mat.cols * mat.rows * mat.elemSize(), NULL, NULL); const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1}; - clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin, + clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), (cl_mem)mat.data, devData, origin, origin, regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL); - clFlush((cl_command_queue)mat.clCxt->oclCommandQueue()); + clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr()); } else { devData = (cl_mem)mat.data; } - clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0); + clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), devData, texture, 0, origin, region, 0, NULL, 0); if ((mat.cols * mat.elemSize() != mat.step)) { - clFlush((cl_command_queue)mat.clCxt->oclCommandQueue()); + clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr()); clReleaseMemObject(devData); } @@ -259,7 +259,7 @@ namespace cv try { cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func"); - finish(); + cv::ocl::finish(); _support = true; } catch (const cv::Exception& e) diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 926b94c9b3..24e8b3e0f6 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; - if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float) + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) { CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } @@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom ) cv::Mat dst(dst_a); a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE)) + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { for (int i = 0; i < contour->total; ++i) { diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp index 5043da05dc..89df73e9a8 100644 --- a/modules/ocl/src/pyrdown.cpp +++ b/modules/ocl/src/pyrdown.cpp @@ -15,8 +15,8 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Dachuan Zhao, dachuan@multicorewareinc.com -// Yao Wang, yao@multicorewareinc.com +// Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, yao@multicorewareinc.com // // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index cdcc8f231f..a69015d190 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -125,7 +125,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - bool is_cpu = queryDeviceInfo(); + bool is_cpu = isCpuDevice(); if (is_cpu) { openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU"); @@ -139,7 +139,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, stringstream idxStr; idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth(); cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str()); - int wave_size = queryDeviceInfo(kernel); + int wave_size = (int)queryWaveFrontSize(kernel); openCLSafeCall(clReleaseKernel(kernel)); static char opt[32] = {0}; diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp index 043031072c..01df30c518 100644 --- a/modules/ocl/src/pyrup.cpp +++ b/modules/ocl/src/pyrup.cpp @@ -15,8 +15,8 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Zhang Chunpeng chunpeng@multicorewareinc.com -// Yao Wang, yao@multicorewareinc.com +// Zhang Chunpeng chunpeng@multicorewareinc.com +// Yao Wang, yao@multicorewareinc.com // // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp index 79bd0f0e21..fb8d05aaa7 100644 --- a/modules/ocl/src/split_merge.cpp +++ b/modules/ocl/src/split_merge.cpp @@ -75,7 +75,7 @@ namespace cv { static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst) { - if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F) + if(!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); return; @@ -170,7 +170,7 @@ namespace cv static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst) { - if(!mat_src.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_src.type() == CV_64F) + if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); return; diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp index 9052dc82bd..c8334cca42 100644 --- a/modules/ocl/src/stereo_csbp.cpp +++ b/modules/ocl/src/stereo_csbp.cpp @@ -150,10 +150,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th)); openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step)); openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } @@ -200,9 +200,9 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th)); openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1)); openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } @@ -235,10 +235,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr, @@ -270,10 +270,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } @@ -340,10 +340,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term)); openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left.step)); openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost, @@ -391,10 +391,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term)); openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step)); openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis, @@ -458,10 +458,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2)); openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1)); openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } //////////////////////////////////////////////////////////////////////////////////////////////// @@ -500,10 +500,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step)); openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step)); openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected, @@ -552,10 +552,10 @@ namespace cv openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane)); openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step)); openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step)); - openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL)); - clFinish(*(cl_command_queue*)getoclCommandQueue()); + clFinish(*(cl_command_queue*)getClCommandQueuePtr()); openCLSafeCall(clReleaseKernel(kernel)); } } diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp index fe9136057b..5bc93aa3f5 100644 --- a/modules/ocl/src/stereobp.cpp +++ b/modules/ocl/src/stereobp.cpp @@ -95,7 +95,10 @@ namespace cv con_struct -> cmax_disc_term = max_disc_term; con_struct -> cdisc_single_jump = disc_single_jump; - cl_con_struct = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), (void *)con_struct, + Context* clCtx = Context::getContext(); + cl_context clContext = *(cl_context*)(clCtx->getOpenCLContextPtr()); + cl_command_queue clCmdQueue = *(cl_command_queue*)(clCtx->getOpenCLCommandQueuePtr()); + cl_con_struct = load_constant(clContext, clCmdQueue, (void *)con_struct, sizeof(con_struct_t)); delete con_struct; diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp index 606ac530f7..c9a3f7abc1 100644 --- a/modules/ocl/src/tvl1flow.cpp +++ b/modules/ocl/src/tvl1flow.cpp @@ -15,7 +15,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Jin Ma, jin@multicorewareinc.com +// Jin Ma, jin@multicorewareinc.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index 594c196a59..8071102bad 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -80,18 +80,18 @@ int main(int argc, char **argv) const char *keys = "{ h | help | false | print help message }" "{ t | type | gpu | set device type:cpu or gpu}" - "{ p | platform | 0 | set platform id }" + "{ p | platform | -1 | set platform id }" "{ d | device | 0 | set device id }"; CommandLineParser cmd(argc, argv, keys); if (cmd.get("help")) { - cout << "Avaible options besides goole test option:" << endl; + cout << "Available options besides google test option:" << endl; cmd.printParams(); return 0; } string type = cmd.get("type"); - unsigned int pid = cmd.get("platform"); + int pid = cmd.get("platform"); int device = cmd.get("device"); print_info(); @@ -100,24 +100,29 @@ int main(int argc, char **argv) { flag = CVCL_DEVICE_TYPE_CPU; } - std::vector oclinfo; - int devnums = getDevice(oclinfo, flag); - if(devnums <= device || device < 0) + + cv::ocl::PlatformsInfo platformsInfo; + cv::ocl::getOpenCLPlatforms(platformsInfo); + if (pid >= (int)platformsInfo.size()) { - std::cout << "device invalid\n"; - return -1; + std::cout << "platform is invalid\n"; + return 1; } - if(pid >= oclinfo.size()) + + cv::ocl::DevicesInfo devicesInfo; + int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]); + if (device < 0 || device >= devnums) { - std::cout << "platform invalid\n"; - return -1; + std::cout << "device/platform invalid\n"; + return 1; } - setDevice(oclinfo[pid], device); - + cv::ocl::setDevice(devicesInfo[device]); setBinaryDiskCache(CACHE_UPDATE); - cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; + cout << "Device type: " << type << endl + << "Platform name: " << devicesInfo[device]->platform->platformName << endl + << "Device name: " << devicesInfo[device]->deviceName << endl; return RUN_ALL_TESTS(); } diff --git a/modules/superres/perf/perf_superres_ocl.cpp b/modules/superres/perf/perf_superres_ocl.cpp index 0b9864cbd3..822b87f441 100644 --- a/modules/superres/perf/perf_superres_ocl.cpp +++ b/modules/superres/perf/perf_superres_ocl.cpp @@ -107,9 +107,6 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1_OCL, Combine(Values(szSmall64, szSmall128), Values(MatType(CV_8UC1), MatType(CV_8UC3)))) { - std::vectorinfo; - cv::ocl::getDevice(info); - declare.time(5 * 60); const Size size = std::tr1::get<0>(GetParam()); diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp index 2f27d50259..5aecca0630 100644 --- a/modules/superres/src/btv_l1_ocl.cpp +++ b/modules/superres/src/btv_l1_ocl.cpp @@ -232,7 +232,7 @@ void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, in cl_mem c_btvRegWeights; size_t count = btvWeights_size * sizeof(float); c_btvRegWeights = openCLCreateBuffer(clCxt, CL_MEM_READ_ONLY, count); - int cl_safe_check = clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL); + int cl_safe_check = clEnqueueWriteBuffer(getClCommandQueue(clCxt), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL); CV_Assert(cl_safe_check == CL_SUCCESS); args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data)); diff --git a/modules/superres/test/test_superres.cpp b/modules/superres/test/test_superres.cpp index 1530d6d667..5cb078f77c 100644 --- a/modules/superres/test/test_superres.cpp +++ b/modules/superres/test/test_superres.cpp @@ -278,8 +278,6 @@ TEST_F(SuperResolution, BTVL1_GPU) #if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL) TEST_F(SuperResolution, BTVL1_OCL) { - std::vector infos; - cv::ocl::getDevice(infos); RunTest(cv::superres::createSuperResolution_BTVL1_OCL()); } #endif