From e8d9ed89559ad33167d23e562b52d18bb0c9f817 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@itseez.com>
Date: Fri, 20 Sep 2013 19:19:52 +0400
Subject: [PATCH] ocl: split initialization.cpp into 3 files: context,
 operations, programcache

---
 modules/nonfree/src/surf.ocl.cpp              |    2 +-
 modules/ocl/include/opencv2/ocl/ocl.hpp       |  148 ++-
 .../ocl/include/opencv2/ocl/private/util.hpp  |  242 ++--
 modules/ocl/perf/main.cpp                     |   28 +-
 modules/ocl/src/arithm.cpp                    |  122 +-
 modules/ocl/src/bgfg_mog.cpp                  |    4 +-
 modules/ocl/src/binarycaching.hpp             |   55 +-
 modules/ocl/src/brute_force_matcher.cpp       |    8 +-
 modules/ocl/src/canny.cpp                     |    8 +-
 modules/ocl/src/cl_context.cpp                |  507 ++++++++
 modules/ocl/src/cl_operations.cpp             |  434 +++++++
 modules/ocl/src/cl_programcache.cpp           |  311 +++++
 modules/ocl/src/error.cpp                     |   16 +-
 modules/ocl/src/fft.cpp                       |   44 +-
 modules/ocl/src/filtering.cpp                 |    2 +-
 modules/ocl/src/gemm.cpp                      |    2 +-
 modules/ocl/src/gftt.cpp                      |    2 +-
 modules/ocl/src/haar.cpp                      |   14 +-
 modules/ocl/src/hog.cpp                       |   14 +-
 modules/ocl/src/imgproc.cpp                   |   38 +-
 modules/ocl/src/initialization.cpp            | 1090 -----------------
 modules/ocl/src/knearest.cpp                  |   14 +-
 modules/ocl/src/matrix_operations.cpp         |   13 +-
 modules/ocl/src/mcwutil.cpp                   |   24 +-
 modules/ocl/src/moments.cpp                   |    4 +-
 modules/ocl/src/pyrdown.cpp                   |    4 +-
 modules/ocl/src/pyrlk.cpp                     |    4 +-
 modules/ocl/src/pyrup.cpp                     |    4 +-
 modules/ocl/src/split_merge.cpp               |    4 +-
 modules/ocl/src/stereo_csbp.cpp               |   36 +-
 modules/ocl/src/stereobp.cpp                  |    5 +-
 modules/ocl/src/tvl1flow.cpp                  |    2 +-
 modules/ocl/test/main.cpp                     |   33 +-
 modules/superres/perf/perf_superres_ocl.cpp   |    3 -
 modules/superres/src/btv_l1_ocl.cpp           |    2 +-
 modules/superres/test/test_superres.cpp       |    2 -
 36 files changed, 1705 insertions(+), 1540 deletions(-)
 create mode 100644 modules/ocl/src/cl_context.cpp
 create mode 100644 modules/ocl/src/cl_operations.cpp
 create mode 100644 modules/ocl/src/cl_programcache.cpp
 delete mode 100644 modules/ocl/src/initialization.cpp
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index f8c1ad7294..59eab705d6 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -74,7 +74,7 @@ namespace cv
             }
             cl_kernel kernel;
             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
-            size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
+            size_t wave_size = queryWaveFrontSize(kernel);
             CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
             sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
             openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index c296f57a3d..21bb607471 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -57,8 +57,7 @@ namespace cv
 {
     namespace ocl
     {
-        using std::auto_ptr;
-        enum
+        enum DeviceType
         {
             CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
             CVCL_DEVICE_TYPE_CPU         = (1 << 1),
@@ -93,77 +92,113 @@ namespace cv
         //return -1 if the target type is unsupported, otherwise return 0
         CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
 
-        //this class contains ocl runtime information
-        class CV_EXPORTS Info
+        // these classes contain OpenCL runtime information
+
+        struct PlatformInfo;
+
+        struct DeviceInfo
         {
-        public:
-            struct Impl;
-            Impl *impl;
+            int _id; // reserved, don't use it
 
-            Info();
-            Info(const Info &m);
-            ~Info();
-            void release();
-            Info &operator = (const Info &m);
-            std::vector<string> DeviceName;
+            DeviceType deviceType;
+            std::string deviceProfile;
+            std::string deviceVersion;
+            std::string deviceName;
+            std::string deviceVendor;
+            int deviceVendorId;
+            std::string deviceDriverVersion;
+            std::string deviceExtensions;
+
+            size_t maxWorkGroupSize;
+            std::vector<size_t> maxWorkItemSizes;
+            int maxComputeUnits;
+            size_t localMemorySize;
+
+            int deviceVersionMajor;
+            int deviceVersionMinor;
+
+            bool haveDoubleSupport;
+            bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
+
+            std::string compilationExtraOptions;
+
+            const PlatformInfo* platform;
+
+            DeviceInfo();
+        };
+
+        struct PlatformInfo
+        {
+            int _id; // reserved, don't use it
+
+            std::string platformProfile;
+            std::string platformVersion;
+            std::string platformName;
+            std::string platformVendor;
+            std::string platformExtensons;
+
+            int platformVersionMajor;
+            int platformVersionMinor;
+
+            std::vector<const DeviceInfo*> devices;
+
+            PlatformInfo();
         };
-        //////////////////////////////// Initialization & Info ////////////////////////
-        //this function may be obsoleted
-        //CV_EXPORTS cl_device_id getDevice();
-        //the function must be called before any other cv::ocl::functions, it initialize ocl runtime
-        //each Info relates to an OpenCL platform
-        //there is one or more devices in each platform, each one has a separate name
-        CV_EXPORTS int getDevice(std::vector<Info> &oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU);
 
-        //set device you want to use, optional function after getDevice be called
-        //the devnum is the index of the selected device in DeviceName vector of INfo
-        CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
+        //////////////////////////////// Initialization & Info ////////////////////////
+        typedef std::vector<const PlatformInfo*> PlatformsInfo;
 
-        //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
-        //returns cl_context *
-        CV_EXPORTS void* getoclContext();
-        //returns cl_command_queue *
-        CV_EXPORTS void* getoclCommandQueue();
+        CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms);
 
-        //explicit call clFinish. The global command queue will be used.
-        CV_EXPORTS void finish();
+        typedef std::vector<const DeviceInfo*> DevicesInfo;
 
-        //this function enable ocl module to use customized cl_context and cl_command_queue
-        //getDevice also need to be called before this function
-        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
+        CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU,
+                const PlatformInfo* platform = NULL);
 
-        //returns true when global OpenCL context is initialized
-        CV_EXPORTS bool initialized();
+        // set device you want to use
+        CV_EXPORTS void setDevice(const DeviceInfo* info);
 
         //////////////////////////////// Error handling ////////////////////////
         CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
 
-        //////////////////////////////// OpenCL context ////////////////////////
-        //This is a global singleton class used to represent a OpenCL context.
+        enum FEATURE_TYPE
+        {
+            FEATURE_CL_DOUBLE = 1,
+            FEATURE_CL_UNIFIED_MEM,
+            FEATURE_CL_VER_1_2
+        };
+
+        // Represents OpenCL context, interface
         class CV_EXPORTS Context
         {
         protected:
-            Context();
-            friend class auto_ptr<Context>;
-            friend bool initialized();
-        private:
-            static auto_ptr<Context> clCxt;
-            static int val;
+            Context() { }
+            ~Context() { }
         public:
-            ~Context();
-            void release();
-            Info::Impl* impl;
-
             static Context* getContext();
-            static void setContext(Info &oclinfo);
 
-            enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
-            bool supportsFeature(int ftype) const;
-            size_t computeUnits() const;
-            void* oclContext();
-            void* oclCommandQueue();
+            bool supportsFeature(FEATURE_TYPE featureType) const;
+            const DeviceInfo& getDeviceInfo() const;
+
+            const void* getOpenCLContextPtr() const;
+            const void* getOpenCLCommandQueuePtr() const;
+            const void* getOpenCLDeviceIDPtr() const;
         };
 
+        inline const void *getClContextPtr()
+        {
+            return Context::getContext()->getOpenCLContextPtr();
+        }
+
+        inline const void *getClCommandQueuePtr()
+        {
+            return Context::getContext()->getOpenCLCommandQueuePtr();
+        }
+
+        bool CV_EXPORTS supportsFeature(FEATURE_TYPE featureType);
+
+        void CV_EXPORTS finish();
+
         //! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
         CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
                                                         const char **source, string kernelName,
@@ -384,7 +419,7 @@ namespace cv
             uchar *dataend;
 
             //! OpenCL context associated with the oclMat object.
-            Context *clCxt;
+            Context *clCxt; // TODO clCtx
             //add offset for handle ROI, calculated in byte
             int offset;
             //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
@@ -1879,11 +1914,6 @@ namespace cv
             oclMat temp5;
         };
 
-        static inline size_t divUp(size_t total, size_t grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
         /*!***************K Nearest Neighbour*************!*/
         class CV_EXPORTS KNearestNeighbour: public CvKNearest
         {
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 3de0d438d4..2aba472f66 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -52,120 +52,138 @@
 
 namespace cv
 {
-    namespace ocl
+namespace ocl
+{
+
+inline cl_device_id getClDeviceID(const Context *ctx)
+{
+    return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr());
+}
+
+inline cl_context getClContext(const Context *ctx)
+{
+    return *(cl_context*)(ctx->getOpenCLContextPtr());
+}
+
+inline cl_command_queue getClCommandQueue(const Context *ctx)
+{
+    return *(cl_command_queue*)(ctx->getOpenCLCommandQueuePtr());
+}
+
+enum openCLMemcpyKind
+{
+    clMemcpyHostToDevice = 0,
+    clMemcpyDeviceToHost,
+    clMemcpyDeviceToDevice
+};
+///////////////////////////OpenCL call wrappers////////////////////////////
+void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
+                                  size_t widthInBytes, size_t height);
+void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+                                    size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
+void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
+                               const void *src, size_t spitch,
+                               size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
+void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
+                                   const void *src, size_t spitch,
+                                   size_t width, size_t height, int src_offset);
+void CV_EXPORTS openCLFree(void *devPtr);
+cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
+void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
+cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+                                               const char **source, std::string kernelName);
+cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
+                                               const char **source, std::string kernelName, const char *build_options);
+void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
+                         int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
+void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
+                          size_t globalThreads[3], size_t localThreads[3],
+                          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+                         size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
+void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+                         size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
+                         int depth, const char *build_options);
+
+cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+                     const size_t size);
+
+cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
+
+int CV_EXPORTS savetofile(const Context *clcxt,  cl_program &program, const char *fileName);
+
+enum FLUSH_MODE
+{
+    CLFINISH = 0,
+    CLFLUSH,
+    DISABLE
+};
+
+void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
+void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
+                          int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+// bind oclMat to OpenCL image textures
+// note:
+//   1. there is no memory management. User need to explicitly release the resource
+//   2. for faster clamping, there is no buffer padding for the constructed texture
+cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
+void CV_EXPORTS releaseTexture(cl_mem& texture);
+
+//Represents an image texture object
+class CV_EXPORTS TextureCL
+{
+public:
+    TextureCL(cl_mem tex, int r, int c, int t)
+        : tex_(tex), rows(r), cols(c), type(t) {}
+    ~TextureCL()
+    {
+        openCLFree(tex_);
+    }
+    operator cl_mem()
     {
-        enum openCLMemcpyKind
-        {
-            clMemcpyHostToDevice = 0,
-            clMemcpyDeviceToHost,
-            clMemcpyDeviceToDevice
-        };
-        ///////////////////////////OpenCL call wrappers////////////////////////////
-        void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
-                                          size_t widthInBytes, size_t height);
-        void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-                                            size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
-        void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
-                                       const void *src, size_t spitch,
-                                       size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
-        void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
-                                           const void *src, size_t spitch,
-                                           size_t width, size_t height, int src_offset);
-        void CV_EXPORTS openCLFree(void *devPtr);
-        cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
-        void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
-        cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
-                                                       const char **source, std::string kernelName);
-        cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
-                                                       const char **source, std::string kernelName, const char *build_options);
-        void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
-                                 int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
-        void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
-                                  size_t globalThreads[3], size_t localThreads[3],
-                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                 size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
-        void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                 size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                 int depth, const char *build_options);
-
-        cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
-                             const size_t size);
-
-        cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
-
-        int CV_EXPORTS savetofile(const Context *clcxt,  cl_program &program, const char *fileName);
-
-        enum FLUSH_MODE
-        {
-            CLFINISH = 0,
-            CLFLUSH,
-            DISABLE
-        };
-
-        void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                  size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
-        void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
-                                  size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                  int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
-        // bind oclMat to OpenCL image textures
-        // note:
-        //   1. there is no memory management. User need to explicitly release the resource
-        //   2. for faster clamping, there is no buffer padding for the constructed texture
-        cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
-        void CV_EXPORTS releaseTexture(cl_mem& texture);
-
-        //Represents an image texture object
-        class CV_EXPORTS TextureCL
-        {
-        public:
-            TextureCL(cl_mem tex, int r, int c, int t)
-                : tex_(tex), rows(r), cols(c), type(t) {}
-            ~TextureCL()
-            {
-                openCLFree(tex_);
-            }
-            operator cl_mem()
-            {
-                return tex_;
-            }
-            cl_mem const tex_;
-            const int rows;
-            const int cols;
-            const int type;
-        private:
-            //disable assignment
-            void operator=(const TextureCL&);
-        };
-        // bind oclMat to OpenCL image textures and retunrs an TextureCL object
-        // note:
-        //   for faster clamping, there is no buffer padding for the constructed texture
-        Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
-
-        // returns whether the current context supports image2d_t format or not
-        bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
-
-        // the enums are used to query device information
-        // currently only support wavefront size queries
-        enum DEVICE_INFO
-        {
-            WAVEFRONT_SIZE,             //in AMD speak
-            IS_CPU_DEVICE               //check if the device is CPU
-        };
-        template<DEVICE_INFO _it, typename _ty>
-        _ty queryDeviceInfo(cl_kernel kernel = NULL);
-
-        template<>
-        int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
-        template<>
-        size_t CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel);
-        template<>
-        bool CV_EXPORTS queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel kernel);
-
-        unsigned long CV_EXPORTS queryLocalMemInfo();
-    }//namespace ocl
+        return tex_;
+    }
+    cl_mem const tex_;
+    const int rows;
+    const int cols;
+    const int type;
+private:
+    //disable assignment
+    void operator=(const TextureCL&);
+};
+// bind oclMat to OpenCL image textures and retunrs an TextureCL object
+// note:
+//   for faster clamping, there is no buffer padding for the constructed texture
+Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
+
+// returns whether the current context supports image2d_t format or not
+bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
+
+bool CV_EXPORTS isCpuDevice();
+
+size_t CV_EXPORTS queryWaveFrontSize(cl_kernel kernel);
 
+
+
+inline size_t divUp(size_t total, size_t grain)
+{
+    return (total + grain - 1) / grain;
+}
+
+inline size_t roundUp(size_t sz, size_t n)
+{
+    // we don't assume that n is a power of 2 (see alignSize)
+    // equal to divUp(sz, n) * n
+    size_t t = sz + n - 1;
+    size_t rem = t % n;
+    size_t result = t - rem;
+    return result;
+}
+
+}//namespace ocl
 }//namespace cv
 
 #endif //__OPENCV_OCL_PRIVATE_UTIL__
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index e24c2c14e5..e82af4e322 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -56,7 +56,7 @@ int main(int argc, char ** argv)
     const char * keys =
         "{ h | help     | false              | print help message }"
         "{ t | type     | gpu                | set device type:cpu or gpu}"
-        "{ p | platform | 0                  | set platform id }"
+        "{ p | platform | -1                 | set platform id }"
         "{ d | device   | 0                  | set device id }";
 
     CommandLineParser cmd(argc, argv, keys);
@@ -68,28 +68,34 @@ int main(int argc, char ** argv)
     }
 
     string type = cmd.get<string>("type");
-    unsigned int pid = cmd.get<unsigned int>("platform");
+    int pid = cmd.get<int>("platform");
     int device = cmd.get<int>("device");
 
     int flag = type == "cpu" ? cv::ocl::CVCL_DEVICE_TYPE_CPU :
                                cv::ocl::CVCL_DEVICE_TYPE_GPU;
 
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = cv::ocl::getDevice(oclinfo, flag);
-    if (devnums <= device || device < 0)
+    cv::ocl::PlatformsInfo platformsInfo;
+    cv::ocl::getOpenCLPlatforms(platformsInfo);
+    if (pid >= (int)platformsInfo.size())
     {
-        std::cout << "device invalid\n";
-        return -1;
+        std::cout << "platform is invalid\n";
+        return 1;
     }
 
-    if (pid >= oclinfo.size())
+    cv::ocl::DevicesInfo devicesInfo;
+    int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]);
+    if (device < 0 || device >= devnums)
     {
-        std::cout << "platform invalid\n";
-        return -1;
+        std::cout << "device/platform invalid\n";
+        return 1;
     }
 
-    cv::ocl::setDevice(oclinfo[pid], device);
+    cv::ocl::setDevice(devicesInfo[device]);
     cv::ocl::setBinaryDiskCache(cv::ocl::CACHE_UPDATE);
 
+    cout << "Device type:" << type << endl
+            << "Platform name:" << devicesInfo[device]->platform->platformName << endl
+            << "Device name:" << devicesInfo[device]->deviceName << endl;
+
     CV_PERF_TEST_MAIN_INTERNALS(ocl, impls)
 }
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 2a663b990a..7d97e67545 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -51,50 +51,10 @@
 //M*/
 
 #include "precomp.hpp"
-#include <iomanip>
-
+#include "opencl_kernels.hpp"
 
 using namespace cv;
 using namespace cv::ocl;
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        //////////////////////////////// OpenCL kernel strings /////////////////////
-
-        extern const char *arithm_absdiff_nonsaturate;
-        extern const char *arithm_nonzero;
-        extern const char *arithm_sum;
-        extern const char *arithm_minMax;
-        extern const char *arithm_minMaxLoc;
-        extern const char *arithm_minMaxLoc_mask;
-        extern const char *arithm_LUT;
-        extern const char *arithm_add;
-        extern const char *arithm_add_mask;
-        extern const char *arithm_add_scalar;
-        extern const char *arithm_add_scalar_mask;
-        extern const char *arithm_bitwise_binary;
-        extern const char *arithm_bitwise_binary_mask;
-        extern const char *arithm_bitwise_binary_scalar;
-        extern const char *arithm_bitwise_binary_scalar_mask;
-        extern const char *arithm_bitwise_not;
-        extern const char *arithm_compare;
-        extern const char *arithm_transpose;
-        extern const char *arithm_flip;
-        extern const char *arithm_flip_rc;
-        extern const char *arithm_magnitude;
-        extern const char *arithm_cartToPolar;
-        extern const char *arithm_polarToCart;
-        extern const char *arithm_exp;
-        extern const char *arithm_log;
-        extern const char *arithm_addWeighted;
-        extern const char *arithm_phase;
-        extern const char *arithm_pow;
-        extern const char *arithm_setidentity;
-    }
-}
 
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
@@ -106,7 +66,7 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const
                             oclMat &dst, int op_type, bool use_scalar = false)
 {
     Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
     if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
@@ -264,7 +224,7 @@ void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
 //////////////////////////////////////////////////////////////////////////////
 
 static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp,
-                        string kernelName, const char **kernelString)
+                        string kernelName, const cv::ocl::ProgramEntry* source)
 {
     CV_Assert(src1.type() == src2.type());
     dst.create(src1.size(), CV_8UC1);
@@ -295,13 +255,13 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int
     args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
                         args, -1, -1, buildOptions.c_str());
 }
 
 void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
     {
         cout << "Selected device do not support double" << endl;
         return;
@@ -358,7 +318,7 @@ Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
 {
     CV_Assert(src.step % src.elemSize() == 0);
 
-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
     CV_Assert(groupnum != 0);
 
     int dbsize = groupnum * src.oclchannels();
@@ -385,7 +345,7 @@ typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth);
 
 Scalar cv::ocl::sum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
@@ -396,7 +356,7 @@ Scalar cv::ocl::sum(const oclMat &src)
         arithmetic_sum<double>
     };
 
-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
     int ddepth = std::max(src.depth(), CV_32S);
     if (!hasDouble && ddepth == CV_64F)
         ddepth = CV_32F;
@@ -407,7 +367,7 @@ Scalar cv::ocl::sum(const oclMat &src)
 
 Scalar cv::ocl::absSum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
@@ -418,7 +378,7 @@ Scalar cv::ocl::absSum(const oclMat &src)
         arithmetic_sum<double>
     };
 
-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
     int ddepth = std::max(src.depth(), CV_32S);
     if (!hasDouble && ddepth == CV_64F)
         ddepth = CV_32F;
@@ -429,7 +389,7 @@ Scalar cv::ocl::absSum(const oclMat &src)
 
 Scalar cv::ocl::sqrSum(const oclMat &src)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
@@ -440,7 +400,7 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
         arithmetic_sum<double>
     };
 
-    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = src.clCxt->supportsFeature(FEATURE_CL_DOUBLE);
     int ddepth = src.depth() <= CV_32S ? CV_32S : (hasDouble ? CV_64F : CV_32F);
 
     sumFunc func = functab[ddepth - CV_32S];
@@ -524,7 +484,7 @@ template <typename T, typename WT>
 void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
                                              const oclMat &mask, oclMat &buf)
 {
-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
     CV_Assert(groupnum != 0);
 
     int dbsize = groupnum * 2 * src.elemSize();
@@ -566,7 +526,7 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons
     if (minVal == NULL && maxVal == NULL)
         return;
 
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
@@ -699,7 +659,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
 
 static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -746,7 +706,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern
 
 static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -792,9 +752,9 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern
 
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
-    const char **kernelString = isVertical ? &arithm_flip_rc : &arithm_flip;
+    const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip;
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
 }
 
 void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
@@ -860,10 +820,10 @@ void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
 //////////////////////////////// exp log /////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
     Context  *clCxt = src.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -893,7 +853,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel
     args.push_back( make_pair( sizeof(cl_int), (void *)&srcstep1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
                         args, src.oclchannels(), -1, buildOptions.c_str());
 }
 
@@ -913,7 +873,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)
 
 static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -955,9 +915,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
     arithmetic_magnitude_phase_run(src1, src2, dst, "arithm_magnitude");
 }
 
-static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -985,7 +945,7 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 
 void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees)
@@ -1004,7 +964,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleI
 static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
                                 string kernelName, bool angleInDegrees)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -1057,7 +1017,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
 static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
                         string kernelName)
 {
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -1176,7 +1136,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                           Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
     CV_Assert(src.oclchannels() == 1);
-    size_t groupnum = src.clCxt->computeUnits();
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
     CV_Assert(groupnum != 0);
     int minloc = -1 , maxloc = -1;
     int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
@@ -1238,7 +1198,7 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                         Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
         return;
@@ -1251,7 +1211,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     };
 
     minMaxLocFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
+    func = functab[(int)src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)];
     func(src, minVal, maxVal, minLoc, maxLoc, mask);
 }
 
@@ -1296,7 +1256,7 @@ int cv::ocl::countNonZero(const oclMat &src)
     CV_Assert(src.channels() == 1);
 
     Context *clCxt = src.clCxt;
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "selected device doesn't support double");
     }
@@ -1327,7 +1287,7 @@ int cv::ocl::countNonZero(const oclMat &src)
 ////////////////////////////////bitwise_op////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString)
+static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
     dst.create(src1.size(), src1.type());
 
@@ -1361,7 +1321,7 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName
     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 
 enum { AND = 0, OR, XOR };
@@ -1370,7 +1330,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
                                oclMat &dst, int operationType)
 {
     Context  *clCxt = src1.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
     {
         cout << "Selected device does not support double" << endl;
         return;
@@ -1442,7 +1402,7 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca
 
 void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
     {
         cout << "Selected device does not support double" << endl;
         return;
@@ -1571,7 +1531,7 @@ oclMatExpr::operator oclMat() const
 static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
 {
     Context  *clCxt = src.clCxt;
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
@@ -1623,7 +1583,7 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
 {
     Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
     if (!hasDouble && src1.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
@@ -1688,7 +1648,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
 /////////////////////////////////// Pow //////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
 {
     CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows);
     CV_Assert(src1.type() == dst.type());
@@ -1718,17 +1678,17 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
     float pf = static_cast<float>(p);
-    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
+    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
         args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
     else
         args.push_back( make_pair( sizeof(cl_double), (void *)&p ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 
 void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 {
-    if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
+    if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.type() == CV_64F)
     {
         cout << "Selected device do not support double" << endl;
         return;
diff --git a/modules/ocl/src/bgfg_mog.cpp b/modules/ocl/src/bgfg_mog.cpp
index 3051ac82f3..cb0dee80f8 100644
--- a/modules/ocl/src/bgfg_mog.cpp
+++ b/modules/ocl/src/bgfg_mog.cpp
@@ -392,7 +392,7 @@ void cv::ocl::device::mog::loadConstants(float Tb, float TB, float Tg, float var
     constants->c_tau = tau;
     constants->c_shadowVal = shadowVal;
 
-    cl_constants = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()),
+    cl_constants = load_constant(*((cl_context*)getClContextPtr()), *((cl_command_queue*)getClCommandQueuePtr()),
         (void *)constants, sizeof(_contant_struct));
 }
 
@@ -635,4 +635,4 @@ void cv::ocl::MOG2::release()
     mean_.release();
 
     bgmodelUsedModes_.release();
-}
\ No newline at end of file
+}
diff --git a/modules/ocl/src/binarycaching.hpp b/modules/ocl/src/binarycaching.hpp
index 0ec565f88b..cc9e71a330 100644
--- a/modules/ocl/src/binarycaching.hpp
+++ b/modules/ocl/src/binarycaching.hpp
@@ -50,41 +50,36 @@ using namespace std;
 using std::cout;
 using std::endl;
 
-namespace cv
+namespace cv { namespace ocl {
+
+class ProgramCache
 {
-    namespace ocl
-    {
-        class ProgramCache
-        {
-        protected:
-            ProgramCache();
-            friend class auto_ptr<ProgramCache>;
-            static auto_ptr<ProgramCache> programCache;
+protected:
+	ProgramCache();
+	~ProgramCache();
+	friend class std::auto_ptr<ProgramCache>;
+public:
+	static ProgramCache *getProgramCache();
 
-        public:
-            ~ProgramCache();
-            static ProgramCache *getProgramCache()
-            {
-                if( NULL == programCache.get())
-                    programCache.reset(new ProgramCache());
-                return programCache.get();
-            }
+	cl_program getProgram(const Context *ctx, const char **source, string kernelName,
+                          const char *build_options);
 
-            //lookup the binary given the file name
-            cl_program progLookup(string srcsign);
+	void releaseProgram();
+protected:
+	//lookup the binary given the file name
+	cl_program progLookup(string srcsign);
 
-            //add program to the cache
-            void addProgram(string srcsign, cl_program program);
-            void releaseProgram();
+	//add program to the cache
+	void addProgram(string srcsign, cl_program program);
 
-            map <string, cl_program> codeCache;
-            unsigned int cacheSize;
-            //The presumed watermark for the cache volume (256MB). Is it enough?
-            //We may need more delicate algorithms when necessary later.
-            //Right now, let's just leave it along.
-            static const unsigned MAX_PROG_CACHE_SIZE = 1024;
-        };
+	map <string, cl_program> codeCache;
+	unsigned int cacheSize;
 
-    }//namespace ocl
+	//The presumed watermark for the cache volume (256MB). Is it enough?
+	//We may need more delicate algorithms when necessary later.
+	//Right now, let's just leave it along.
+	static const unsigned MAX_PROG_CACHE_SIZE = 1024;
+};
 
+}//namespace ocl
 }//namespace cv
diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp
index 74da6ddd06..0273ed5891 100644
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -245,7 +245,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
@@ -265,7 +265,7 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
@@ -286,7 +286,7 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
 {
     const oclMat zeroMask;
     const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
     if (query.cols <= 64)
     {
         matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
@@ -469,7 +469,7 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
 static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                       const oclMat &trainIdx, const oclMat &distance, int distType)
 {
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
     if (query.cols <= 64)
     {
         knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp
index 4c7b988f6f..a25c1973ef 100644
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -98,7 +98,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
     {
         openCLFree(counter);
     }
-    counter = clCreateBuffer( *((cl_context*)getoclContext()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
+    counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
     openCLSafeCall(err);
 }
 
@@ -354,7 +354,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
 void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
 {
     unsigned int count;
-    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
+    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
     Context *clCxt = map.clCxt;
     string kernelName = "edgesHysteresisGlobal";
     vector< pair<size_t, const void *> > args;
@@ -363,7 +363,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
     int count_i[1] = {0};
     while(count > 0)
     {
-        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
+        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
 
         args.clear();
         size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
@@ -378,7 +378,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
         args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
 
         openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
+        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
         std::swap(st1, st2);
     }
 }
diff --git a/modules/ocl/src/cl_context.cpp b/modules/ocl/src/cl_context.cpp
new file mode 100644
index 0000000000..6413465f65
--- /dev/null
+++ b/modules/ocl/src/cl_context.cpp
@@ -0,0 +1,507 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "binarycaching.hpp"
+
+#undef __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+namespace cv { namespace ocl {
+
+extern void fft_teardown();
+extern void clBlasTeardown();
+
+struct PlatformInfoImpl
+{
+    cl_platform_id platform_id;
+
+    std::vector<int> deviceIDs;
+
+    PlatformInfo info;
+
+    PlatformInfoImpl()
+        : platform_id(NULL)
+    {
+    }
+};
+
+struct DeviceInfoImpl
+{
+    cl_platform_id platform_id;
+    cl_device_id device_id;
+
+    DeviceInfo info;
+
+    DeviceInfoImpl()
+        : platform_id(NULL), device_id(NULL)
+    {
+    }
+};
+
+static std::vector<PlatformInfoImpl> global_platforms;
+static std::vector<DeviceInfoImpl> global_devices;
+
+static bool parseOpenCLVersion(const std::string& versionStr, int& major, int& minor)
+{
+    size_t p0 = versionStr.find(' ');
+    while (true)
+    {
+        if (p0 == std::string::npos)
+            break;
+        if (p0 + 1 >= versionStr.length())
+            break;
+        char c = versionStr[p0 + 1];
+        if (isdigit(c))
+            break;
+        p0 = versionStr.find(' ', p0 + 1);
+    }
+    size_t p1 = versionStr.find('.', p0);
+    size_t p2 = versionStr.find(' ', p1);
+    if (p0 == std::string::npos || p1 == std::string::npos || p2 == std::string::npos)
+    {
+        major = 0;
+        minor = 0;
+        return false;
+    }
+    std::string majorStr = versionStr.substr(p0 + 1, p1 - p0 - 1);
+    std::string minorStr = versionStr.substr(p1 + 1, p2 - p1 - 1);
+    major = atoi(majorStr.c_str());
+    minor = atoi(minorStr.c_str());
+    return true;
+}
+
+static int initializeOpenCLDevices()
+{
+    assert(global_devices.size() == 0);
+
+    std::vector<cl::Platform> platforms;
+    try
+    {
+        openCLSafeCall(cl::Platform::get(&platforms));
+    }
+    catch (cv::Exception& e)
+    {
+        return 0; // OpenCL not found
+    }
+
+    global_platforms.resize(platforms.size());
+
+    for (size_t i = 0; i < platforms.size(); ++i)
+    {
+        PlatformInfoImpl& platformInfo = global_platforms[i];
+        platformInfo.info._id = i;
+
+        cl::Platform& platform = platforms[i];
+
+        platformInfo.platform_id = platform();
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_PROFILE, &platformInfo.info.platformProfile));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_VERSION, &platformInfo.info.platformVersion));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_NAME, &platformInfo.info.platformName));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_VENDOR, &platformInfo.info.platformVendor));
+        openCLSafeCall(platform.getInfo(CL_PLATFORM_EXTENSIONS, &platformInfo.info.platformExtensons));
+
+        parseOpenCLVersion(platformInfo.info.platformVersion,
+                platformInfo.info.platformVersionMajor, platformInfo.info.platformVersionMinor);
+
+        std::vector<cl::Device> devices;
+        cl_int status = platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+        if(status != CL_DEVICE_NOT_FOUND)
+            openCLVerifyCall(status);
+
+        if(devices.size() > 0)
+        {
+            int baseIndx = global_devices.size();
+            global_devices.resize(baseIndx + devices.size());
+            platformInfo.deviceIDs.resize(devices.size());
+            platformInfo.info.devices.resize(devices.size());
+
+            for(size_t j = 0; j < devices.size(); ++j)
+            {
+                cl::Device& device = devices[j];
+
+                DeviceInfoImpl& deviceInfo = global_devices[baseIndx + j];
+                deviceInfo.info._id = baseIndx + j;
+                deviceInfo.platform_id = platform();
+                deviceInfo.device_id = device();
+
+                deviceInfo.info.platform = &platformInfo.info;
+                platformInfo.deviceIDs[j] = deviceInfo.info._id;
+
+                cl_device_type type = -1;
+                openCLSafeCall(device.getInfo(CL_DEVICE_TYPE, &type));
+                deviceInfo.info.deviceType = DeviceType(type);
+
+                openCLSafeCall(device.getInfo(CL_DEVICE_PROFILE, &deviceInfo.info.deviceProfile));
+                openCLSafeCall(device.getInfo(CL_DEVICE_VERSION, &deviceInfo.info.deviceVersion));
+                openCLSafeCall(device.getInfo(CL_DEVICE_NAME, &deviceInfo.info.deviceName));
+                openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR, &deviceInfo.info.deviceVendor));
+                cl_uint vendorID = -1;
+                openCLSafeCall(device.getInfo(CL_DEVICE_VENDOR_ID, &vendorID));
+                deviceInfo.info.deviceVendorId = vendorID;
+                openCLSafeCall(device.getInfo(CL_DRIVER_VERSION, &deviceInfo.info.deviceDriverVersion));
+                openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions));
+
+                parseOpenCLVersion(deviceInfo.info.deviceVersion,
+                        deviceInfo.info.deviceVersionMajor, deviceInfo.info.deviceVersionMinor);
+
+                size_t maxWorkGroupSize = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &maxWorkGroupSize));
+                deviceInfo.info.maxWorkGroupSize = maxWorkGroupSize;
+
+                cl_uint maxDimensions = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, &maxDimensions));
+                std::vector<size_t> maxWorkItemSizes(maxDimensions);
+                openCLSafeCall(clGetDeviceInfo(device(), CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions,
+                        (void *)&maxWorkItemSizes[0], 0));
+                deviceInfo.info.maxWorkItemSizes = maxWorkItemSizes;
+
+                cl_uint maxComputeUnits = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &maxComputeUnits));
+                deviceInfo.info.maxComputeUnits = maxComputeUnits;
+
+                cl_ulong localMemorySize = 0;
+                openCLSafeCall(device.getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &localMemorySize));
+                deviceInfo.info.localMemorySize = (size_t)localMemorySize;
+
+
+                cl_bool unifiedMemory = false;
+                openCLSafeCall(device.getInfo(CL_DEVICE_HOST_UNIFIED_MEMORY, &unifiedMemory));
+                deviceInfo.info.isUnifiedMemory = unifiedMemory != 0;
+
+                //initialize extra options for compilation. Currently only fp64 is included.
+                //Assume 4KB is enough to store all possible extensions.
+                openCLSafeCall(device.getInfo(CL_DEVICE_EXTENSIONS, &deviceInfo.info.deviceExtensions));
+
+                size_t fp64_khr = deviceInfo.info.deviceExtensions.find("cl_khr_fp64");
+                if(fp64_khr != std::string::npos)
+                {
+                    deviceInfo.info.compilationExtraOptions += "-D DOUBLE_SUPPORT";
+                    deviceInfo.info.haveDoubleSupport = true;
+                }
+                else
+                {
+                    deviceInfo.info.haveDoubleSupport = false;
+                }
+            }
+        }
+    }
+
+    for (size_t i = 0; i < platforms.size(); ++i)
+    {
+        PlatformInfoImpl& platformInfo = global_platforms[i];
+        for(size_t j = 0; j < platformInfo.deviceIDs.size(); ++j)
+        {
+            DeviceInfoImpl& deviceInfo = global_devices[platformInfo.deviceIDs[j]];
+            platformInfo.info.devices[j] = &deviceInfo.info;
+        }
+    }
+
+    return global_devices.size();
+}
+
+
+DeviceInfo::DeviceInfo()
+    : _id(-1), deviceType(DeviceType(0)),
+      deviceVendorId(-1),
+      maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0),
+      deviceVersionMajor(0), deviceVersionMinor(0),
+      haveDoubleSupport(false), isUnifiedMemory(false),
+      platform(NULL)
+{
+    // nothing
+}
+
+PlatformInfo::PlatformInfo()
+    : _id(-1),
+      platformVersionMajor(0), platformVersionMinor(0)
+{
+    // nothing
+}
+
+//////////////////////////////// OpenCL context ////////////////////////
+//This is a global singleton class used to represent a OpenCL context.
+class ContextImpl : public Context
+{
+public:
+    const cl_device_id clDeviceID;
+    cl_context clContext;
+    cl_command_queue clCmdQueue;
+    const DeviceInfo& deviceInfo;
+
+protected:
+    ContextImpl(const DeviceInfo& deviceInfo, cl_device_id clDeviceID)
+        : clDeviceID(clDeviceID), clContext(NULL), clCmdQueue(NULL), deviceInfo(deviceInfo)
+    {
+        // nothing
+    }
+    ~ContextImpl();
+public:
+
+    static ContextImpl* getContext();
+    static void setContext(const DeviceInfo* deviceInfo);
+
+    bool supportsFeature(FEATURE_TYPE featureType) const;
+
+    static void cleanupContext(void);
+};
+
+static cv::Mutex currentContextMutex;
+static ContextImpl* currentContext = NULL;
+
+Context* Context::getContext()
+{
+    return currentContext;
+}
+
+bool Context::supportsFeature(FEATURE_TYPE featureType) const
+{
+    return ((ContextImpl*)this)->supportsFeature(featureType);
+}
+
+const DeviceInfo& Context::getDeviceInfo() const
+{
+    return ((ContextImpl*)this)->deviceInfo;
+}
+
+const void* Context::getOpenCLContextPtr() const
+{
+    return &(((ContextImpl*)this)->clContext);
+}
+
+const void* Context::getOpenCLCommandQueuePtr() const
+{
+    return &(((ContextImpl*)this)->clCmdQueue);
+}
+
+const void* Context::getOpenCLDeviceIDPtr() const
+{
+    return &(((ContextImpl*)this)->clDeviceID);
+}
+
+
+bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
+{
+    switch (featureType)
+    {
+    case FEATURE_CL_DOUBLE:
+        return deviceInfo.haveDoubleSupport;
+    case FEATURE_CL_UNIFIED_MEM:
+        return deviceInfo.isUnifiedMemory;
+    case FEATURE_CL_VER_1_2:
+        return deviceInfo.deviceVersionMajor > 1 || (deviceInfo.deviceVersionMajor == 1 && deviceInfo.deviceVersionMinor >= 2);
+    }
+    CV_Error(CV_StsBadArg, "Invalid feature type");
+    return false;
+}
+
+#if defined(WIN32)
+static bool __termination = false;
+#endif
+
+ContextImpl::~ContextImpl()
+{
+    fft_teardown();
+    clBlasTeardown();
+
+#ifdef WIN32
+    // if process is on termination stage (ExitProcess was called and other threads were terminated)
+    // then disable command queue release because it may cause program hang
+    if (!__termination)
+#endif
+    {
+        if(clCmdQueue)
+        {
+            openCLSafeCall(clReleaseCommandQueue(clCmdQueue)); // some cleanup problems are here
+        }
+
+        if(clContext)
+        {
+            openCLSafeCall(clReleaseContext(clContext));
+        }
+    }
+    clCmdQueue = NULL;
+    clContext = NULL;
+}
+
+void ContextImpl::cleanupContext(void)
+{
+    cv::AutoLock lock(currentContextMutex);
+    if (currentContext)
+        delete currentContext;
+    currentContext = NULL;
+}
+
+void ContextImpl::setContext(const DeviceInfo* deviceInfo)
+{
+    CV_Assert(deviceInfo->_id >= 0 && deviceInfo->_id < (int)global_devices.size());
+
+    DeviceInfoImpl& infoImpl = global_devices[deviceInfo->_id];
+    CV_Assert(deviceInfo == &infoImpl.info);
+
+    cl_int status = 0;
+    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(infoImpl.platform_id), 0 };
+    cl_context clContext = clCreateContext(cps, 1, &infoImpl.device_id, NULL, NULL, &status);
+    openCLVerifyCall(status);
+    // TODO add CL_QUEUE_PROFILING_ENABLE
+    cl_command_queue clCmdQueue = clCreateCommandQueue(clContext, infoImpl.device_id, 0, &status);
+    openCLVerifyCall(status);
+
+    ContextImpl* ctx = new ContextImpl(infoImpl.info, infoImpl.device_id);
+    ctx->clCmdQueue = clCmdQueue;
+    ctx->clContext = clContext;
+
+    ContextImpl* old = NULL;
+    {
+        cv::AutoLock lock(currentContextMutex);
+        old = currentContext;
+        currentContext = ctx;
+    }
+    if (old != NULL)
+    {
+        delete old;
+    }
+}
+
+ContextImpl* ContextImpl::getContext()
+{
+    return currentContext;
+}
+
+int getOpenCLPlatforms(PlatformsInfo& platforms)
+{
+    platforms.clear();
+
+    for (size_t id = 0; id < global_platforms.size(); ++id)
+    {
+        PlatformInfoImpl& impl = global_platforms[id];
+        platforms.push_back(&impl.info);
+    }
+
+    return platforms.size();
+}
+
+int getOpenCLDevices(std::vector<const DeviceInfo*> &devices, int deviceType, const PlatformInfo* platform)
+{
+    devices.clear();
+
+    switch(deviceType)
+    {
+    case CVCL_DEVICE_TYPE_DEFAULT:
+    case CVCL_DEVICE_TYPE_CPU:
+    case CVCL_DEVICE_TYPE_GPU:
+    case CVCL_DEVICE_TYPE_ACCELERATOR:
+    case CVCL_DEVICE_TYPE_ALL:
+        break;
+    default:
+        return 0;
+    }
+
+    if (platform == NULL)
+    {
+        for (size_t id = 0; id < global_devices.size(); ++id)
+        {
+            DeviceInfoImpl& deviceInfo = global_devices[id];
+            if (((int)deviceInfo.info.deviceType & deviceType) == deviceType)
+            {
+                devices.push_back(&deviceInfo.info);
+            }
+        }
+    }
+    else
+    {
+        for (size_t id = 0; id < platform->devices.size(); ++id)
+        {
+            const DeviceInfo* deviceInfo = platform->devices[id];
+            if (((int)deviceInfo->deviceType & deviceType) == deviceType)
+            {
+                devices.push_back(deviceInfo);
+            }
+        }
+    }
+
+    return (int)devices.size();
+}
+
+void setDevice(const DeviceInfo* info)
+{
+    ContextImpl::setContext(info);
+}
+
+bool supportsFeature(FEATURE_TYPE featureType)
+{
+    return Context::getContext()->supportsFeature(featureType);
+}
+
+struct __Module
+{
+    __Module() { initializeOpenCLDevices(); }
+    ~__Module() { ContextImpl::cleanupContext(); }
+};
+static __Module __module;
+
+
+}//namespace ocl
+}//namespace cv
+
+
+#if defined(WIN32) && defined(CVAPI_EXPORTS)
+
+extern "C"
+BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD fdwReason, LPVOID lpReserved)
+{
+    if (fdwReason == DLL_PROCESS_DETACH)
+    {
+        if (lpReserved != NULL) // called after ExitProcess() call
+            cv::ocl::__termination = true;
+    }
+    return TRUE;
+}
+
+#endif
diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp
new file mode 100644
index 0000000000..42138adbe0
--- /dev/null
+++ b/modules/ocl/src/cl_operations.cpp
@@ -0,0 +1,434 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "binarycaching.hpp"
+
+#undef __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+//#define PRINT_KERNEL_RUN_TIME
+#define RUN_TIMES 100
+#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
+#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
+#endif
+//#define AMD_DOUBLE_DIFFER
+
+namespace cv { namespace ocl {
+
+DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
+DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
+int gDevMemTypeValueMap[5] = {0,
+                              CL_MEM_ALLOC_HOST_PTR,
+                              CL_MEM_USE_HOST_PTR,
+                              CL_MEM_COPY_HOST_PTR,
+                              CL_MEM_USE_PERSISTENT_MEM_AMD};
+int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
+
+void finish()
+{
+    clFinish(getClCommandQueue(Context::getContext()));
+}
+
+bool isCpuDevice()
+{
+    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
+    return (info.deviceType == CVCL_DEVICE_TYPE_CPU);
+}
+
+size_t queryWaveFrontSize(cl_kernel kernel)
+{
+    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
+    if (info.deviceType == CVCL_DEVICE_TYPE_CPU)
+        return 1;
+    size_t wavefront = 0;
+    CV_Assert(kernel != NULL);
+    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()),
+            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL));
+    return wavefront;
+}
+
+
+void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size)
+{
+    cl_int status;
+    status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0,
+                                 size, host_buffer, 0, NULL, NULL);
+    openCLVerifyCall(status);
+}
+
+cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
+{
+    cl_int status;
+    cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status);
+    openCLVerifyCall(status);
+    return buffer;
+}
+
+void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
+                       size_t widthInBytes, size_t height)
+{
+    openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
+}
+
+void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
+                       size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
+{
+    cl_int status;
+    *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
+                              widthInBytes * height, 0, &status);
+    openCLVerifyCall(status);
+    *pitch = widthInBytes;
+}
+
+void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch,
+                    const void *src, size_t spitch,
+                    size_t width, size_t height, openCLMemcpyKind kind, int channels)
+{
+    size_t buffer_origin[3] = {0, 0, 0};
+    size_t host_origin[3] = {0, 0, 0};
+    size_t region[3] = {width, height, 1};
+    if(kind == clMemcpyHostToDevice)
+    {
+        if(dpitch == width || channels == 3 || height == 1)
+        {
+            openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
+                                                0, width * height, src, 0, NULL, NULL));
+        }
+        else
+        {
+            openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
+                                                    buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
+        }
+    }
+    else if(kind == clMemcpyDeviceToHost)
+    {
+        if(spitch == width || channels == 3 || height == 1)
+        {
+            openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
+                                               0, width * height, dst, 0, NULL, NULL));
+        }
+        else
+        {
+            openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
+                                                   buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
+        }
+    }
+}
+
+void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
+                        const void *src, size_t spitch,
+                        size_t width, size_t height, int src_offset)
+{
+    size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
+    size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
+    size_t region[3] = {width, height, 1};
+
+    openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
+                                           region, spitch, 0, dpitch, 0, 0, 0, 0));
+}
+
+void openCLFree(void *devPtr)
+{
+    openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
+}
+
+cl_kernel openCLGetKernelFromSource(const Context *ctx, const char **source, string kernelName)
+{
+    return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
+}
+
+cl_kernel openCLGetKernelFromSource(const Context *ctx, const char **source, string kernelName,
+                                    const char *build_options)
+{
+    cl_kernel kernel;
+    cl_int status = 0;
+    CV_Assert(ProgramCache::getProgramCache() != NULL);
+    cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, kernelName, build_options);
+    CV_Assert(program != NULL);
+    kernel = clCreateKernel(program, kernelName.c_str(), &status);
+    openCLVerifyCall(status);
+    return kernel;
+}
+
+void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads)
+{
+    size_t kernelWorkGroupSize;
+    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx),
+                                            CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+    CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] );
+    CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] );
+    CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] );
+    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
+    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize );
+}
+
+#ifdef PRINT_KERNEL_RUN_TIME
+static double total_execute_time = 0;
+static double total_kernel_time = 0;
+#endif
+void openCLExecuteKernel_(Context *ctx , const char **source, string kernelName, size_t globalThreads[3],
+                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
+                          int depth, const char *build_options)
+{
+    //construct kernel name
+    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
+    //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
+    stringstream idxStr;
+    if(channels != -1)
+        idxStr << "_C" << channels;
+    if(depth != -1)
+        idxStr << "_D" << depth;
+    kernelName += idxStr.str();
+
+    cl_kernel kernel;
+    kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options);
+
+    if ( localThreads != NULL)
+    {
+        globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
+        globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
+        globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
+
+        cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
+    }
+    for(size_t i = 0; i < args.size(); i ++)
+        openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
+
+#ifndef PRINT_KERNEL_RUN_TIME
+    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                                          localThreads, 0, NULL, NULL));
+#else
+    cl_event event = NULL;
+    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                                          localThreads, 0, NULL, &event));
+
+    cl_ulong start_time, end_time, queue_time;
+    double execute_time = 0;
+    double total_time   = 0;
+
+    openCLSafeCall(clWaitForEvents(1, &event));
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
+                                           sizeof(cl_ulong), &start_time, 0));
+
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                                           sizeof(cl_ulong), &end_time, 0));
+
+    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+                                           sizeof(cl_ulong), &queue_time, 0));
+
+    execute_time = (double)(end_time - start_time) / (1000 * 1000);
+    total_time = (double)(end_time - queue_time) / (1000 * 1000);
+
+    total_execute_time += execute_time;
+    total_kernel_time += total_time;
+    clReleaseEvent(event);
+#endif
+
+    clFlush(getClCommandQueue(ctx));
+    openCLSafeCall(clReleaseKernel(kernel));
+}
+
+void openCLExecuteKernel(Context *ctx , const char **source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth)
+{
+    openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
+                        channels, depth, NULL);
+}
+void openCLExecuteKernel(Context *ctx , const char **source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
+
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+    openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
+                         build_options);
+#else
+    string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
+    cout << endl;
+    cout << "Function Name: " << kernelName;
+    if(depth >= 0)
+        cout << " |data type: " << data_type[depth];
+    cout << " |channels: " << channels;
+    cout << " |Time Unit: " << "ms" << endl;
+
+    total_execute_time = 0;
+    total_kernel_time = 0;
+    cout << "-------------------------------------" << endl;
+
+    cout << setiosflags(ios::left) << setw(15) << "excute time";
+    cout << setiosflags(ios::left) << setw(15) << "lauch time";
+    cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
+    int i = 0;
+    for(i = 0; i < RUN_TIMES; i++)
+        openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
+                             build_options);
+
+    cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
+    cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
+#endif
+}
+
+double openCLExecuteKernelInterop(Context *ctx , const char **source, string kernelName,
+                         size_t globalThreads[3], size_t localThreads[3],
+                         vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
+                         bool finish, bool measureKernelTime, bool cleanUp)
+
+{
+    //construct kernel name
+    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
+    //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
+    stringstream idxStr;
+    if(channels != -1)
+        idxStr << "_C" << channels;
+    if(depth != -1)
+        idxStr << "_D" << depth;
+    kernelName += idxStr.str();
+
+    cl_kernel kernel;
+    kernel = openCLGetKernelFromSource(ctx, source, kernelName, build_options);
+
+    double kernelTime = 0.0;
+
+    if( globalThreads != NULL)
+    {
+        if ( localThreads != NULL)
+        {
+            globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
+            globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
+            globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
+
+            //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+            cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
+        }
+        for(size_t i = 0; i < args.size(); i ++)
+            openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
+
+        if(measureKernelTime == false)
+        {
+            openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                            localThreads, 0, NULL, NULL));
+        }
+        else
+        {
+            cl_event event = NULL;
+            openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
+                            localThreads, 0, NULL, &event));
+
+            cl_ulong end_time, queue_time;
+
+            openCLSafeCall(clWaitForEvents(1, &event));
+
+            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                            sizeof(cl_ulong), &end_time, 0));
+
+            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+                            sizeof(cl_ulong), &queue_time, 0));
+
+            kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
+
+            clReleaseEvent(event);
+        }
+    }
+
+    if(finish)
+    {
+        clFinish(getClCommandQueue(ctx));
+    }
+
+    if(cleanUp)
+    {
+        openCLSafeCall(clReleaseKernel(kernel));
+    }
+
+    return kernelTime;
+}
+
+//double openCLExecuteKernelInterop(Context *ctx , const char **fileName, const int numFiles, string kernelName,
+//                         size_t globalThreads[3], size_t localThreads[3],
+//                         vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
+//                         bool finish, bool measureKernelTime, bool cleanUp)
+//
+//{
+//    std::vector<std::string> fsource;
+//    for (int i = 0 ; i < numFiles ; i++)
+//    {
+//        std::string str;
+//        if (convertToString(fileName[i], str) >= 0)
+//            fsource.push_back(str);
+//    }
+//    const char **source = new const char *[numFiles];
+//    for (int i = 0 ; i < numFiles ; i++)
+//        source[i] = fsource[i].c_str();
+//    double kernelTime = openCLExecuteKernelInterop(ctx ,source, kernelName, globalThreads, localThreads,
+//                         args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
+//    fsource.clear();
+//    delete []source;
+//    return kernelTime;
+//}
+
+cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
+                     const size_t size)
+{
+    int status;
+    cl_mem con_struct;
+
+    con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
+    openCLSafeCall(status);
+
+    openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
+                                        value, 0, 0, 0));
+
+    return con_struct;
+
+}
+
+}//namespace ocl
+}//namespace cv
diff --git a/modules/ocl/src/cl_programcache.cpp b/modules/ocl/src/cl_programcache.cpp
new file mode 100644
index 0000000000..3261319c34
--- /dev/null
+++ b/modules/ocl/src/cl_programcache.cpp
@@ -0,0 +1,311 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Guoping Long, longguoping@gmail.com
+//    Niko Li, newlife20080214@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#include <fstream>
+#include "binarycaching.hpp"
+
+#undef __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+namespace cv { namespace ocl {
+/*
+ * The binary caching system to eliminate redundant program source compilation.
+ * Strictly, this is not a cache because we do not implement evictions right now.
+ * We shall add such features to trade-off memory consumption and performance when necessary.
+ */
+
+std::auto_ptr<ProgramCache> _programCache;
+ProgramCache* ProgramCache::getProgramCache()
+{
+	if (NULL == _programCache.get())
+		_programCache.reset(new ProgramCache());
+	return _programCache.get();
+}
+
+ProgramCache::ProgramCache()
+{
+	codeCache.clear();
+	cacheSize = 0;
+}
+
+ProgramCache::~ProgramCache()
+{
+	releaseProgram();
+}
+
+cl_program ProgramCache::progLookup(string srcsign)
+{
+	map<string, cl_program>::iterator iter;
+	iter = codeCache.find(srcsign);
+	if(iter != codeCache.end())
+		return iter->second;
+	else
+		return NULL;
+}
+
+void ProgramCache::addProgram(string srcsign , cl_program program)
+{
+	if(!progLookup(srcsign))
+	{
+		codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
+	}
+}
+
+void ProgramCache::releaseProgram()
+{
+	map<string, cl_program>::iterator iter;
+	for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
+	{
+		openCLSafeCall(clReleaseProgram(iter->second));
+	}
+	codeCache.clear();
+	cacheSize = 0;
+}
+
+static int enable_disk_cache =
+#ifdef _DEBUG
+        false;
+#else
+        true;
+#endif
+static int update_disk_cache = false;
+static String binpath = "";
+
+void setBinaryDiskCache(int mode, String path)
+{
+    if(mode == CACHE_NONE)
+    {
+        update_disk_cache = 0;
+        enable_disk_cache = 0;
+        return;
+    }
+    update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE;
+    enable_disk_cache |=
+#ifdef _DEBUG
+        (mode & CACHE_DEBUG)   == CACHE_DEBUG;
+#else
+        (mode & CACHE_RELEASE) == CACHE_RELEASE;
+#endif
+    if(enable_disk_cache && !path.empty())
+    {
+        binpath = path;
+    }
+}
+
+void setBinpath(const char *path)
+{
+    binpath = path;
+}
+
+int savetofile(const Context*,  cl_program &program, const char *fileName)
+{
+    size_t binarySize;
+    openCLSafeCall(clGetProgramInfo(program,
+                            CL_PROGRAM_BINARY_SIZES,
+                            sizeof(size_t),
+                            &binarySize, NULL));
+    char* binary = (char*)malloc(binarySize);
+    if(binary == NULL)
+    {
+        CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
+    }
+    openCLSafeCall(clGetProgramInfo(program,
+                            CL_PROGRAM_BINARIES,
+                            sizeof(char *),
+                            &binary,
+                            NULL));
+
+    FILE *fp = fopen(fileName, "wb+");
+    if(fp != NULL)
+    {
+        fwrite(binary, binarySize, 1, fp);
+        free(binary);
+        fclose(fp);
+    }
+    return 1;
+}
+
+cl_program ProgramCache::getProgram(const Context *ctx, const char **source, string kernelName,
+                                    const char *build_options)
+{
+    cl_program program;
+    cl_int status = 0;
+    stringstream src_sign;
+    string srcsign;
+    string filename;
+
+    if (NULL != build_options)
+    {
+        src_sign << (int64)(*source) << getClContext(ctx) << "_" << build_options;
+    }
+    else
+    {
+        src_sign << (int64)(*source) << getClContext(ctx);
+    }
+    srcsign = src_sign.str();
+
+    program = NULL;
+    program = ProgramCache::getProgramCache()->progLookup(srcsign);
+
+    if (!program)
+    {
+        //config build programs
+        std::string all_build_options;
+        if (!ctx->getDeviceInfo().compilationExtraOptions.empty())
+            all_build_options += ctx->getDeviceInfo().compilationExtraOptions;
+        if (build_options != NULL)
+        {
+            all_build_options += " ";
+            all_build_options += build_options;
+        }
+        filename = binpath + kernelName + "_" + ctx->getDeviceInfo().deviceName + all_build_options + ".clb";
+
+        FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
+        if(fp == NULL || update_disk_cache)
+        {
+            if(fp != NULL)
+                fclose(fp);
+
+            program = clCreateProgramWithSource(
+                          getClContext(ctx), 1, source, NULL, &status);
+            openCLVerifyCall(status);
+            cl_device_id device = getClDeviceID(ctx);
+            status = clBuildProgram(program, 1, &device, all_build_options.c_str(), NULL, NULL);
+            if(status == CL_SUCCESS && enable_disk_cache)
+                savetofile(ctx, program, filename.c_str());
+        }
+        else
+        {
+            fseek(fp, 0, SEEK_END);
+            size_t binarySize = ftell(fp);
+            fseek(fp, 0, SEEK_SET);
+            char *binary = new char[binarySize];
+            CV_Assert(1 == fread(binary, binarySize, 1, fp));
+            fclose(fp);
+            cl_int status = 0;
+            cl_device_id device = getClDeviceID(ctx);
+            program = clCreateProgramWithBinary(getClContext(ctx),
+                                                1,
+                                                &device,
+                                                (const size_t *)&binarySize,
+                                                (const unsigned char **)&binary,
+                                                NULL,
+                                                &status);
+            openCLVerifyCall(status);
+            status = clBuildProgram(program, 1, &device, all_build_options.c_str(), NULL, NULL);
+            delete[] binary;
+        }
+
+        if(status != CL_SUCCESS)
+        {
+            if(status == CL_BUILD_PROGRAM_FAILURE)
+            {
+                cl_int logStatus;
+                char *buildLog = NULL;
+                size_t buildLogSize = 0;
+                logStatus = clGetProgramBuildInfo(program,
+                        getClDeviceID(ctx), CL_PROGRAM_BUILD_LOG, buildLogSize,
+                        buildLog, &buildLogSize);
+                if(logStatus != CL_SUCCESS)
+                    std::cout << "Failed to build the program and get the build info." << endl;
+                buildLog = new char[buildLogSize];
+                CV_DbgAssert(!!buildLog);
+                memset(buildLog, 0, buildLogSize);
+                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
+                                                     CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
+                std::cout << "\n\t\t\tBUILD LOG\n";
+                std::cout << buildLog << endl;
+                delete [] buildLog;
+            }
+            openCLVerifyCall(status);
+        }
+        //Cache the binary for future use if build_options is null
+        if( (this->cacheSize += 1) < MAX_PROG_CACHE_SIZE)
+            this->addProgram(srcsign, program);
+        else
+            cout << "Warning: code cache has been full.\n";
+    }
+    return program;
+}
+
+//// Converts the contents of a file into a string
+//static int convertToString(const char *filename, std::string& s)
+//{
+//    size_t size;
+//    char*  str;
+//
+//    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
+//    if(f.is_open())
+//    {
+//        size_t fileSize;
+//        f.seekg(0, std::fstream::end);
+//        size = fileSize = (size_t)f.tellg();
+//        f.seekg(0, std::fstream::beg);
+//
+//        str = new char[size+1];
+//        if(!str)
+//        {
+//            f.close();
+//            return -1;
+//        }
+//
+//        f.read(str, fileSize);
+//        f.close();
+//        str[size] = '\0';
+//
+//        s = str;
+//        delete[] str;
+//        return 0;
+//    }
+//    printf("Error: Failed to open file %s\n", filename);
+//    return -1;
+//}
+
+} // namespace ocl
+} // namespace cv
diff --git a/modules/ocl/src/error.cpp b/modules/ocl/src/error.cpp
index e854e70cd0..cd6d3d5346 100644
--- a/modules/ocl/src/error.cpp
+++ b/modules/ocl/src/error.cpp
@@ -152,19 +152,19 @@ namespace cv
             case CL_INVALID_GLOBAL_WORK_SIZE:
                 return "CL_INVALID_GLOBAL_WORK_SIZE";
                 //case CL_INVALID_PROPERTY:
-                //	return "CL_INVALID_PROPERTY";
+                //    return "CL_INVALID_PROPERTY";
                 //case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
-                //	return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+                //    return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
                 //case CL_PLATFORM_NOT_FOUND_KHR:
-                //	return "CL_PLATFORM_NOT_FOUND_KHR";
-                //	//case CL_INVALID_PROPERTY_EXT:
-                //	//    return "CL_INVALID_PROPERTY_EXT";
+                //    return "CL_PLATFORM_NOT_FOUND_KHR";
+                //    //case CL_INVALID_PROPERTY_EXT:
+                //    //    return "CL_INVALID_PROPERTY_EXT";
                 //case CL_DEVICE_PARTITION_FAILED_EXT:
-                //	return "CL_DEVICE_PARTITION_FAILED_EXT";
+                //    return "CL_DEVICE_PARTITION_FAILED_EXT";
                 //case CL_INVALID_PARTITION_COUNT_EXT:
-                //	return "CL_INVALID_PARTITION_COUNT_EXT";
+                //    return "CL_INVALID_PARTITION_COUNT_EXT";
                 //default:
-                //	return "unknown error code";
+                //    return "unknown error code";
             }
             static char buf[256];
             sprintf(buf, "%d", err);
diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp
index b6cc070fb5..c0785ac9d8 100644
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -156,25 +156,25 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
 {
     fft_setup();
 
-    bool is_1d_input	= (_dft_size.height == 1);
-    int is_row_dft		= flags & DFT_ROWS;
+    bool is_1d_input    = (_dft_size.height == 1);
+    int is_row_dft        = flags & DFT_ROWS;
     int is_scaled_dft   = flags & DFT_SCALE;
-    int is_inverse		= flags & DFT_INVERSE;
+    int is_inverse        = flags & DFT_INVERSE;
 
-    //clAmdFftResultLocation	place;
-    clAmdFftLayout			inLayout;
-    clAmdFftLayout			outLayout;
-    clAmdFftDim				dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D;
+    //clAmdFftResultLocation    place;
+    clAmdFftLayout            inLayout;
+    clAmdFftLayout            outLayout;
+    clAmdFftDim                dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D;
 
-    size_t batchSize		 = is_row_dft ? dft_size.height : 1;
+    size_t batchSize         = is_row_dft ? dft_size.height : 1;
     size_t clLengthsIn[ 3 ]  = {1, 1, 1};
     size_t clStridesIn[ 3 ]  = {1, 1, 1};
     //size_t clLengthsOut[ 3 ] = {1, 1, 1};
     size_t clStridesOut[ 3 ] = {1, 1, 1};
-    clLengthsIn[0]			 = dft_size.width;
-    clLengthsIn[1]			 = is_row_dft ? 1 : dft_size.height;
-    clStridesIn[0]			 = 1;
-    clStridesOut[0]			 = 1;
+    clLengthsIn[0]             = dft_size.width;
+    clLengthsIn[1]             = is_row_dft ? 1 : dft_size.height;
+    clStridesIn[0]             = 1;
+    clStridesOut[0]             = 1;
 
     switch(_type)
     {
@@ -206,7 +206,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
     clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
     clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
 
-    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) );
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getClContextPtr(), dim, clLengthsIn ) );
 
     openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
     openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
@@ -220,7 +220,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
     openCLSafeCall( clAmdFftSetPlanScale  ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
 
     //ready to bake
-    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getClCommandQueuePtr(), NULL, NULL ) );
 }
 cv::ocl::FftPlan::~FftPlan()
 {
@@ -296,12 +296,12 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
     // similar assertions with cuda module
     CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
 
-    //bool is_1d_input	= (src.rows == 1);
-    //int is_row_dft		= flags & DFT_ROWS;
-    //int is_scaled_dft		= flags & DFT_SCALE;
-    int is_inverse			= flags & DFT_INVERSE;
-    bool is_complex_input	= src.channels() == 2;
-    bool is_complex_output	= !(flags & DFT_REAL_OUTPUT);
+    //bool is_1d_input    = (src.rows == 1);
+    //int is_row_dft        = flags & DFT_ROWS;
+    //int is_scaled_dft        = flags & DFT_SCALE;
+    int is_inverse = flags & DFT_INVERSE;
+    bool is_complex_input = src.channels() == 2;
+    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
 
 
     // We don't support real-to-real transform
@@ -338,10 +338,10 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
     if (buffersize)
     {
         cl_int medstatus;
-        clMedBuffer = clCreateBuffer ( (cl_context)src.clCxt->oclContext(), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+        clMedBuffer = clCreateBuffer ( *(cl_context*)(src.clCxt->getOpenCLContextPtr()), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
         openCLSafeCall( medstatus );
     }
-    cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue();
+    cl_command_queue clq = *(cl_command_queue*)(src.clCxt->getOpenCLCommandQueuePtr());
     openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
         is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
         1,
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index 284dc61632..caaf53d849 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -1430,7 +1430,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
 
 void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
 {
-    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
         return;
diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp
index ec03c2f932..687f26f632 100644
--- a/modules/ocl/src/gemm.cpp
+++ b/modules/ocl/src/gemm.cpp
@@ -134,7 +134,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
     int offb    = src2.offset;
     int offc    = dst.offset;
 
-    cl_command_queue clq = (cl_command_queue)src1.clCxt->oclCommandQueue();
+    cl_command_queue clq = *(cl_command_queue*)src1.clCxt->getOpenCLCommandQueuePtr();
     switch(src1.type())
     {
     case CV_32FC1:
diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp
index 37ebaafa38..29a96ae658 100644
--- a/modules/ocl/src/gftt.cpp
+++ b/modules/ocl/src/gftt.cpp
@@ -338,7 +338,7 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &poin
     CV_DbgAssert(points.type() == CV_32FC2);
     points_v.resize(points.cols);
     openCLSafeCall(clEnqueueReadBuffer(
-        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
+        *(cl_command_queue*)getClCommandQueuePtr(),
         reinterpret_cast<cl_mem>(points.data),
         CL_TRUE,
         0,
diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp
index 212fd2c444..e3e73b3c3d 100644
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -745,7 +745,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
     if( gimg.cols < minSize.width || gimg.rows < minSize.height )
         CV_Error(CV_StsError, "Image too small");
 
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
     if( (flags & CV_HAAR_SCALE_IMAGE) )
     {
         CvSize winSize0 = cascade->orig_window_size;
@@ -788,7 +788,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 
         size_t blocksize = 8;
         size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->getDeviceInfo().maxComputeUnits) *localThreads[0],
                                     localThreads[1], 1
                                   };
         int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -949,7 +949,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
         int grp_per_CU = 12;
         size_t blocksize = 8;
         size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->getDeviceInfo().maxComputeUnits *localThreads[0],
                                     localThreads[1], 1 };
         int outputsz = 256 * globalThreads[0] / localThreads[0];
         int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
@@ -1120,7 +1120,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     int blocksize = 8;
     int grp_per_CU = 12;
     size_t localThreads[3] = { blocksize, blocksize, 1 };
-    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
+    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->getDeviceInfo().maxComputeUnits *localThreads[0],
         localThreads[1],
         1 };
     int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -1148,7 +1148,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
     }
 
     int *candidate;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
     if( (flags & CV_HAAR_SCALE_IMAGE) )
     {
         int indexy = 0;
@@ -1340,7 +1340,7 @@ void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
     GpuHidHaarStageClassifier    *stage;
     GpuHidHaarClassifier         *classifier;
     GpuHidHaarTreeNode           *node;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
     if( (flags & CV_HAAR_SCALE_IMAGE) )
     {
         gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
@@ -1505,7 +1505,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
     CvSize sz;
     CvSize winSize0 = oldCascade->orig_window_size;
     detect_piramid_info *scaleinfo;
-    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    cl_command_queue qu = getClCommandQueue(Context::getContext());
     if (flags & CV_HAAR_SCALE_IMAGE)
     {
         for(factor = 1.f;; factor *= scaleFactor)
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 55872829a9..563172bc13 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -157,7 +157,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
 
     effect_size = Size(0, 0);
 
-    if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+    if (isCpuDevice())
         hog_device_cpu = true;
     else
         hog_device_cpu = false;
@@ -1670,9 +1670,9 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     else
     {
         cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
         char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
         openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
             localThreads, args, -1, -1, opt);
     }
@@ -1734,9 +1734,9 @@ void cv::ocl::device::hog::normalize_hists(int nbins,
     else
     {
         cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
         char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
         openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
                              localThreads, args, -1, -1, opt);
     }
@@ -1803,9 +1803,9 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
     else
     {
         cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        size_t wave_size = queryWaveFrontSize(kernel);
         char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
         openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
                              localThreads, args, -1, -1, opt);
     }
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index 7d0d941dfa..0949605e15 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -289,7 +289,7 @@ namespace cv
                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
 
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                 {
                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
                 }
@@ -317,7 +317,7 @@ namespace cv
                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                 {
                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
                 }
@@ -380,7 +380,7 @@ namespace cv
                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                 {
                     args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
                     args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
@@ -802,12 +802,12 @@ namespace cv
                 string kernelName = "warpAffine" + s[interpolation];
 
 
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                 {
                     cl_int st;
-                    coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
                     openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
                 }
                 else
                 {
@@ -817,8 +817,8 @@ namespace cv
                         {
                             float_coeffs[m][n] = coeffs[m][n];
                         }
-                        coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
-                        openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
+                        coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
+                        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
 
                 }
                 //TODO: improve this kernel
@@ -872,12 +872,12 @@ namespace cv
                 string s[3] = {"NN", "Linear", "Cubic"};
                 string kernelName = "warpPerspective" + s[interpolation];
 
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
                 {
                     cl_int st;
-                    coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
                     openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
                 }
                 else
                 {
@@ -886,9 +886,9 @@ namespace cv
                         for(int n = 0; n < 3; n++)
                             float_coeffs[m][n] = coeffs[m][n];
 
-                    coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
+                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
                     openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
+                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
                 }
                 //TODO: improve this kernel
                 size_t blkSizeX = 16, blkSizeY = 16;
@@ -994,7 +994,7 @@ namespace cv
         void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
         {
             CV_Assert(src.type() == CV_8UC1);
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
             {
                 CV_Error(CV_GpuNotSupported, "select device don't support double");
             }
@@ -1192,7 +1192,7 @@ namespace cv
         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
                           double k, int borderType)
         {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
             {
                 CV_Error(CV_GpuNotSupported, "select device don't support double");
             }
@@ -1211,7 +1211,7 @@ namespace cv
 
         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
         {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
             {
                 CV_Error(CV_GpuNotSupported, "select device don't support double");
             }
@@ -1512,17 +1512,17 @@ namespace cv
                 String kernelName = "calcLut";
                 size_t localThreads[3]  = { 32, 8, 1 };
                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
-                bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+                bool is_cpu = isCpuDevice();
                 if (is_cpu)
                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
                 else
                 {
                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
-                    int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+                    size_t wave_size = queryWaveFrontSize(kernel);
                     openCLSafeCall(clReleaseKernel(kernel));
 
                     static char opt[20] = {0};
-                    sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+                    sprintf(opt, " -D WAVE_SIZE=%d", (int)wave_size);
                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
                 }
             }
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
deleted file mode 100644
index c18984b078..0000000000
--- a/modules/ocl/src/initialization.cpp
+++ /dev/null
@@ -1,1090 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Guoping Long, longguoping@gmail.com
-//    Niko Li, newlife20080214@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <iomanip>
-#include <fstream>
-#include "binarycaching.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace std;
-using std::cout;
-using std::endl;
-
-//#define PRINT_KERNEL_RUN_TIME
-#define RUN_TIMES 100
-#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
-#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
-#endif
-//#define AMD_DOUBLE_DIFFER
-
-namespace cv
-{
-    namespace ocl
-    {
-        extern void fft_teardown();
-        extern void clBlasTeardown();
-        /*
-         * The binary caching system to eliminate redundant program source compilation.
-         * Strictly, this is not a cache because we do not implement evictions right now.
-         * We shall add such features to trade-off memory consumption and performance when necessary.
-         */
-        auto_ptr<ProgramCache> ProgramCache::programCache;
-        ProgramCache *programCache = NULL;
-        DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
-        DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
-        int gDevMemTypeValueMap[5] = {0,
-                                      CL_MEM_ALLOC_HOST_PTR,
-                                      CL_MEM_USE_HOST_PTR,
-                                      CL_MEM_COPY_HOST_PTR,
-                                      CL_MEM_USE_PERSISTENT_MEM_AMD};
-        int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
-
-        ProgramCache::ProgramCache()
-        {
-            codeCache.clear();
-            cacheSize = 0;
-        }
-
-        ProgramCache::~ProgramCache()
-        {
-            releaseProgram();
-        }
-
-        cl_program ProgramCache::progLookup(string srcsign)
-        {
-            map<string, cl_program>::iterator iter;
-            iter = codeCache.find(srcsign);
-            if(iter != codeCache.end())
-                return iter->second;
-            else
-                return NULL;
-        }
-
-        void ProgramCache::addProgram(string srcsign , cl_program program)
-        {
-            if(!progLookup(srcsign))
-            {
-                codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
-            }
-        }
-
-        void ProgramCache::releaseProgram()
-        {
-            map<string, cl_program>::iterator iter;
-            for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
-            {
-                openCLSafeCall(clReleaseProgram(iter->second));
-            }
-            codeCache.clear();
-            cacheSize = 0;
-        }
-        struct Info::Impl
-        {
-            cl_platform_id oclplatform;
-            std::vector<cl_device_id> devices;
-            std::vector<std::string> devName;
-            std::string clVersion;
-
-            cl_context oclcontext;
-            cl_command_queue clCmdQueue;
-            int devnum;
-            size_t maxWorkGroupSize;
-            cl_uint maxDimensions; // == maxWorkItemSizes.size()
-            std::vector<size_t> maxWorkItemSizes;
-            cl_uint maxComputeUnits;
-            char extra_options[512];
-            int  double_support;
-            int unified_memory; //1 means integrated GPU, otherwise this value is 0
-            int refcounter;
-
-            Impl();
-
-            void setDevice(void *ctx, void *q, int devnum);
-
-            void release()
-            {
-                if(1 == CV_XADD(&refcounter, -1))
-                {
-                    releaseResources();
-                    delete this;
-                }
-            }
-
-            Impl* copy()
-            {
-                CV_XADD(&refcounter, 1);
-                return this;
-            }
-
-        private:
-            Impl(const Impl&);
-            Impl& operator=(const Impl&);
-            void releaseResources();
-        };
-
-        // global variables to hold binary cache properties
-        static int enable_disk_cache =
-#ifdef _DEBUG
-            false;
-#else
-            true;
-#endif
-        static int update_disk_cache = false;
-        static String binpath = "";
-
-        Info::Impl::Impl()
-            :oclplatform(0),
-            oclcontext(0),
-            clCmdQueue(0),
-            devnum(-1),
-            maxWorkGroupSize(0),
-            maxDimensions(0),
-            maxComputeUnits(0),
-            double_support(0),
-            unified_memory(0),
-            refcounter(1)
-        {
-            memset(extra_options, 0, 512);
-        }
-
-        void Info::Impl::releaseResources()
-        {
-            devnum = -1;
-
-            if(clCmdQueue)
-            {
-                //temporarily disable command queue release as it causes program hang at exit
-                //openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
-                clCmdQueue = 0;
-            }
-
-            if(oclcontext)
-            {
-                openCLSafeCall(clReleaseContext(oclcontext));
-                oclcontext = 0;
-            }
-        }
-
-        void Info::Impl::setDevice(void *ctx, void *q, int dnum)
-        {
-            if((ctx && q) || devnum != dnum)
-                releaseResources();
-
-            CV_Assert(dnum >= 0 && dnum < (int)devices.size());
-            devnum = dnum;
-            if(ctx && q)
-            {
-                oclcontext = (cl_context)ctx;
-                clCmdQueue = (cl_command_queue)q;
-                clRetainContext(oclcontext);
-                clRetainCommandQueue(clCmdQueue);
-            }
-            else
-            {
-                cl_int status = 0;
-                cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
-                oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
-                openCLVerifyCall(status);
-                clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
-                openCLVerifyCall(status);
-            }
-
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
-            maxWorkItemSizes.resize(maxDimensions);
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
-
-            cl_bool unfymem = false;
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
-            unified_memory = unfymem ? 1 : 0;
-
-            //initialize extra options for compilation. Currently only fp64 is included.
-            //Assume 4KB is enough to store all possible extensions.
-            const int EXT_LEN = 4096 + 1 ;
-            char extends_set[EXT_LEN];
-            size_t extends_size;
-            openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
-            extends_set[EXT_LEN - 1] = 0;
-            size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
-
-            if(fp64_khr != std::string::npos)
-            {
-                sprintf(extra_options, "-D DOUBLE_SUPPORT");
-                double_support = 1;
-            }
-            else
-            {
-                memset(extra_options, 0, 512);
-                double_support = 0;
-            }
-        }
-
-        ////////////////////////Common OpenCL specific calls///////////////
-        int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
-        {
-            rw_type = gDeviceMemRW;
-            mem_type = gDeviceMemType;
-            return Context::getContext()->impl->unified_memory;
-        }
-
-        int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
-        {
-            if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
-                 mem_type == DEVICE_MEM_UHP ||
-                 mem_type == DEVICE_MEM_CHP )
-                return -1;
-            gDeviceMemRW = rw_type;
-            gDeviceMemType = mem_type;
-            return 0;
-        }
-
-        int getDevice(std::vector<Info> &oclinfo, int devicetype)
-        {
-            //TODO: cache oclinfo vector
-            oclinfo.clear();
-
-            switch(devicetype)
-            {
-            case CVCL_DEVICE_TYPE_DEFAULT:
-            case CVCL_DEVICE_TYPE_CPU:
-            case CVCL_DEVICE_TYPE_GPU:
-            case CVCL_DEVICE_TYPE_ACCELERATOR:
-            case CVCL_DEVICE_TYPE_ALL:
-                break;
-            default:
-                return 0;
-            }
-
-            // Platform info
-            cl_uint numPlatforms;
-            openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
-            if(numPlatforms < 1) return 0;
-
-            std::vector<cl_platform_id> platforms(numPlatforms);
-            openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
-
-            char deviceName[256];
-            int devcienums = 0;
-            char clVersion[256];
-            for (unsigned i = 0; i < numPlatforms; ++i)
-            {
-                cl_uint numsdev = 0;
-                cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
-                if(status != CL_DEVICE_NOT_FOUND)
-                    openCLVerifyCall(status);
-
-                if(numsdev > 0)
-                {
-                    devcienums += numsdev;
-                    std::vector<cl_device_id> devices(numsdev);
-                    openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
-
-                    Info ocltmpinfo;
-                    ocltmpinfo.impl->oclplatform = platforms[i];
-                    openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
-                    ocltmpinfo.impl->clVersion = clVersion;
-                    for(unsigned j = 0; j < numsdev; ++j)
-                    {
-                        ocltmpinfo.impl->devices.push_back(devices[j]);
-                        openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
-                        ocltmpinfo.impl->devName.push_back(deviceName);
-                        ocltmpinfo.DeviceName.push_back(deviceName);
-                    }
-                    oclinfo.push_back(ocltmpinfo);
-                }
-            }
-            if(devcienums > 0)
-            {
-                setDevice(oclinfo[0]);
-            }
-            return devcienums;
-        }
-
-        void setDevice(Info &oclinfo, int devnum)
-        {
-            oclinfo.impl->setDevice(0, 0, devnum);
-            Context::setContext(oclinfo);
-        }
-
-        void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
-        {
-            oclinfo.impl->setDevice(ctx, q, devnum);
-            Context::setContext(oclinfo);
-         }
-
-        void *getoclContext()
-        {
-            return &(Context::getContext()->impl->oclcontext);
-        }
-
-        void *getoclCommandQueue()
-        {
-            return &(Context::getContext()->impl->clCmdQueue);
-        }
-
-        void finish()
-        {
-            clFinish(Context::getContext()->impl->clCmdQueue);
-        }
-
-        //template specializations of queryDeviceInfo
-        template<>
-        bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
-        {
-            Info::Impl* impl = Context::getContext()->impl;
-            cl_device_type devicetype;
-            openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
-                CL_DEVICE_TYPE, sizeof(cl_device_type),
-                &devicetype, NULL));
-            return (devicetype == CVCL_DEVICE_TYPE_CPU);
-        }
-
-        template<typename _ty>
-        static _ty queryWavesize(cl_kernel kernel)
-        {
-            size_t info = 0;
-            Info::Impl* impl = Context::getContext()->impl;
-            bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
-            if(is_cpu)
-            {
-                return 1;
-            }
-            CV_Assert(kernel != NULL);
-            openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
-                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
-            return static_cast<_ty>(info);
-        }
-
-        template<>
-        size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
-        {
-            return queryWavesize<size_t>(kernel);
-        }
-        template<>
-        int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
-        {
-            return queryWavesize<int>(kernel);
-        }
-
-        void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
-        {
-            cl_int status;
-            status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
-                                         size, host_buffer, 0, NULL, NULL);
-            openCLVerifyCall(status);
-        }
-
-        cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
-        {
-            cl_int status;
-            cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
-            openCLVerifyCall(status);
-            return buffer;
-        }
-
-        void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
-                               size_t widthInBytes, size_t height)
-        {
-            openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
-        }
-
-        void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
-        {
-            cl_int status;
-            *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
-                                      widthInBytes * height, 0, &status);
-            openCLVerifyCall(status);
-            *pitch = widthInBytes;
-        }
-
-        void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
-                            const void *src, size_t spitch,
-                            size_t width, size_t height, openCLMemcpyKind kind, int channels)
-        {
-            size_t buffer_origin[3] = {0, 0, 0};
-            size_t host_origin[3] = {0, 0, 0};
-            size_t region[3] = {width, height, 1};
-            if(kind == clMemcpyHostToDevice)
-            {
-                if(dpitch == width || channels == 3 || height == 1)
-                {
-                    openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
-                                                        0, width * height, src, 0, NULL, NULL));
-                }
-                else
-                {
-                    openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
-                                                            buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
-                }
-            }
-            else if(kind == clMemcpyDeviceToHost)
-            {
-                if(spitch == width || channels == 3 || height == 1)
-                {
-                    openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
-                                                       0, width * height, dst, 0, NULL, NULL));
-                }
-                else
-                {
-                    openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
-                                                           buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
-                }
-            }
-        }
-
-        void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
-                                const void *src, size_t spitch,
-                                size_t width, size_t height, int src_offset)
-        {
-            size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
-            size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
-            size_t region[3] = {width, height, 1};
-
-            openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
-                                                   region, spitch, 0, dpitch, 0, 0, 0, 0));
-        }
-
-        void openCLFree(void *devPtr)
-        {
-            openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
-        }
-        cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
-        {
-            return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
-        }
-
-        void setBinaryDiskCache(int mode, String path)
-        {
-            if(mode == CACHE_NONE)
-            {
-                update_disk_cache = 0;
-                enable_disk_cache = 0;
-                return;
-            }
-            update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE;
-            enable_disk_cache |=
-#ifdef _DEBUG
-                (mode & CACHE_DEBUG)   == CACHE_DEBUG;
-#else
-                (mode & CACHE_RELEASE) == CACHE_RELEASE;
-#endif
-            if(enable_disk_cache && !path.empty())
-            {
-                binpath = path;
-            }
-        }
-
-        void setBinpath(const char *path)
-        {
-            binpath = path;
-        }
-
-        int savetofile(const Context*,  cl_program &program, const char *fileName)
-        {
-            size_t binarySize;
-            openCLSafeCall(clGetProgramInfo(program,
-                                    CL_PROGRAM_BINARY_SIZES,
-                                    sizeof(size_t),
-                                    &binarySize, NULL));
-            char* binary = (char*)malloc(binarySize);
-            if(binary == NULL)
-            {
-                CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
-            }
-            openCLSafeCall(clGetProgramInfo(program,
-                                    CL_PROGRAM_BINARIES,
-                                    sizeof(char *),
-                                    &binary,
-                                    NULL));
-
-            FILE *fp = fopen(fileName, "wb+");
-            if(fp != NULL)
-            {
-                fwrite(binary, binarySize, 1, fp);
-                free(binary);
-                fclose(fp);
-            }
-            return 1;
-        }
-
-        cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
-                                            const char *build_options)
-        {
-            cl_kernel kernel;
-            cl_program program ;
-            cl_int status = 0;
-            stringstream src_sign;
-            string srcsign;
-            string filename;
-            CV_Assert(programCache != NULL);
-
-            if(NULL != build_options)
-            {
-                src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
-            }
-            else
-            {
-                src_sign << (int64)(*source) << clCxt->impl->oclcontext;
-            }
-            srcsign = src_sign.str();
-
-            program = NULL;
-            program = programCache->progLookup(srcsign);
-
-            if(!program)
-            {
-                //config build programs
-                char all_build_options[1024];
-                memset(all_build_options, 0, 1024);
-                char zeromem[512] = {0};
-                if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512))
-                    strcat(all_build_options, clCxt -> impl->extra_options);
-                strcat(all_build_options, " ");
-                if(build_options != NULL)
-                    strcat(all_build_options, build_options);
-                if(all_build_options != NULL)
-                {
-                    filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
-                }
-                else
-                {
-                    filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
-                }
-
-                FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
-                if(fp == NULL || update_disk_cache)
-                {
-                    if(fp != NULL)
-                        fclose(fp);
-
-                    program = clCreateProgramWithSource(
-                                  clCxt->impl->oclcontext, 1, source, NULL, &status);
-                    openCLVerifyCall(status);
-                    status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
-                    if(status == CL_SUCCESS && enable_disk_cache)
-                        savetofile(clCxt, program, filename.c_str());
-                }
-                else
-                {
-                    fseek(fp, 0, SEEK_END);
-                    size_t binarySize = ftell(fp);
-                    fseek(fp, 0, SEEK_SET);
-                    char *binary = new char[binarySize];
-                    CV_Assert(1 == fread(binary, binarySize, 1, fp));
-                    fclose(fp);
-                    cl_int status = 0;
-                    program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
-                                                        1,
-                                                        &(clCxt->impl->devices[clCxt->impl->devnum]),
-                                                        (const size_t *)&binarySize,
-                                                        (const unsigned char **)&binary,
-                                                        NULL,
-                                                        &status);
-                    openCLVerifyCall(status);
-                    status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
-                    delete[] binary;
-                }
-
-                if(status != CL_SUCCESS)
-                {
-                    if(status == CL_BUILD_PROGRAM_FAILURE)
-                    {
-                        cl_int logStatus;
-                        char *buildLog = NULL;
-                        size_t buildLogSize = 0;
-                        logStatus = clGetProgramBuildInfo(program,
-                                                          clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
-                                                          buildLog, &buildLogSize);
-                        if(logStatus != CL_SUCCESS)
-                            cout << "Failed to build the program and get the build info." << endl;
-                        buildLog = new char[buildLogSize];
-                        CV_DbgAssert(!!buildLog);
-                        memset(buildLog, 0, buildLogSize);
-                        openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
-                                                             CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
-                        cout << "\n\t\t\tBUILD LOG\n";
-                        cout << buildLog << endl;
-                        delete [] buildLog;
-                    }
-                    openCLVerifyCall(status);
-                }
-                //Cache the binary for future use if build_options is null
-                if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
-                    programCache->addProgram(srcsign, program);
-                else
-                    cout << "Warning: code cache has been full.\n";
-            }
-            kernel = clCreateKernel(program, kernelName.c_str(), &status);
-            openCLVerifyCall(status);
-            return kernel;
-        }
-
-        void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
-        {
-            size_t kernelWorkGroupSize;
-            openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
-                                                    CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
-            CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
-            CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
-            CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
-            CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
-            CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
-        }
-
-        static inline size_t roundUp(size_t sz, size_t n)
-        {
-            // we don't assume that n is a power of 2 (see alignSize)
-            // equal to divUp(sz, n) * n
-            size_t t = sz + n - 1;
-            size_t rem = t % n;
-            size_t result = t - rem;
-            return result;
-        }
-
-#ifdef PRINT_KERNEL_RUN_TIME
-        static double total_execute_time = 0;
-        static double total_kernel_time = 0;
-#endif
-        void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
-                                  size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
-                                  int depth, const char *build_options)
-        {
-            //construct kernel name
-            //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
-            //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
-            stringstream idxStr;
-            if(channels != -1)
-                idxStr << "_C" << channels;
-            if(depth != -1)
-                idxStr << "_D" << depth;
-            kernelName += idxStr.str();
-
-            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
-
-            if ( localThreads != NULL)
-            {
-                globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
-                globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
-                globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
-
-                cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
-            }
-            for(size_t i = 0; i < args.size(); i ++)
-                openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
-
-#ifndef PRINT_KERNEL_RUN_TIME
-            openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
-                                                  localThreads, 0, NULL, NULL));
-#else
-            cl_event event = NULL;
-            openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
-                                                  localThreads, 0, NULL, &event));
-
-            cl_ulong start_time, end_time, queue_time;
-            double execute_time = 0;
-            double total_time   = 0;
-
-            openCLSafeCall(clWaitForEvents(1, &event));
-            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
-                                                   sizeof(cl_ulong), &start_time, 0));
-
-            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
-                                                   sizeof(cl_ulong), &end_time, 0));
-
-            openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
-                                                   sizeof(cl_ulong), &queue_time, 0));
-
-            execute_time = (double)(end_time - start_time) / (1000 * 1000);
-            total_time = (double)(end_time - queue_time) / (1000 * 1000);
-
-            total_execute_time += execute_time;
-            total_kernel_time += total_time;
-            clReleaseEvent(event);
-#endif
-
-            clFlush(clCxt->impl->clCmdQueue);
-            openCLSafeCall(clReleaseKernel(kernel));
-        }
-
-        void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
-                                 size_t globalThreads[3], size_t localThreads[3],
-                                 vector< pair<size_t, const void *> > &args, int channels, int depth)
-        {
-            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
-                                channels, depth, NULL);
-        }
-        void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
-                                 size_t globalThreads[3], size_t localThreads[3],
-                                 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
-
-        {
-#ifndef PRINT_KERNEL_RUN_TIME
-            openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
-                                 build_options);
-#else
-            string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
-            cout << endl;
-            cout << "Function Name: " << kernelName;
-            if(depth >= 0)
-                cout << " |data type: " << data_type[depth];
-            cout << " |channels: " << channels;
-            cout << " |Time Unit: " << "ms" << endl;
-
-            total_execute_time = 0;
-            total_kernel_time = 0;
-            cout << "-------------------------------------" << endl;
-
-            cout << setiosflags(ios::left) << setw(15) << "excute time";
-            cout << setiosflags(ios::left) << setw(15) << "lauch time";
-            cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
-            int i = 0;
-            for(i = 0; i < RUN_TIMES; i++)
-                openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
-                                     build_options);
-
-            cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
-            cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
-#endif
-        }
-
-       double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
-                                 size_t globalThreads[3], size_t localThreads[3],
-                                 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
-                                 bool finish, bool measureKernelTime, bool cleanUp)
-
-        {
-            //construct kernel name
-            //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
-            //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
-            stringstream idxStr;
-            if(channels != -1)
-                idxStr << "_C" << channels;
-            if(depth != -1)
-                idxStr << "_D" << depth;
-            kernelName += idxStr.str();
-
-            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
-
-            double kernelTime = 0.0;
-
-            if( globalThreads != NULL)
-            {
-                if ( localThreads != NULL)
-                {
-                    globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
-                    globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
-                    globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
-
-                    //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
-                    cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
-                }
-                for(size_t i = 0; i < args.size(); i ++)
-                    openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
-
-                if(measureKernelTime == false)
-                {
-                    openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
-                                    localThreads, 0, NULL, NULL));
-                }
-                else
-                {
-                    cl_event event = NULL;
-                    openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
-                                    localThreads, 0, NULL, &event));
-
-                    cl_ulong end_time, queue_time;
-
-                    openCLSafeCall(clWaitForEvents(1, &event));
-
-                    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
-                                    sizeof(cl_ulong), &end_time, 0));
-
-                    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
-                                    sizeof(cl_ulong), &queue_time, 0));
-
-                    kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
-
-                    clReleaseEvent(event);
-                }
-            }
-
-            if(finish)
-            {
-                clFinish(clCxt->impl->clCmdQueue);
-            }
-
-            if(cleanUp)
-            {
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-
-            return kernelTime;
-        }
-
-        // Converts the contents of a file into a string
-        static int convertToString(const char *filename, std::string& s)
-        {
-            size_t size;
-            char*  str;
-
-            std::fstream f(filename, (std::fstream::in | std::fstream::binary));
-            if(f.is_open())
-            {
-                size_t fileSize;
-                f.seekg(0, std::fstream::end);
-                size = fileSize = (size_t)f.tellg();
-                f.seekg(0, std::fstream::beg);
-
-                str = new char[size+1];
-                if(!str)
-                {
-                    f.close();
-                    return -1;
-                }
-
-                f.read(str, fileSize);
-                f.close();
-                str[size] = '\0';
-
-                s = str;
-                delete[] str;
-                return 0;
-            }
-            printf("Error: Failed to open file %s\n", filename);
-            return -1;
-        }
-
-        double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
-                                 size_t globalThreads[3], size_t localThreads[3],
-                                 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
-                                 bool finish, bool measureKernelTime, bool cleanUp)
-
-        {
-            std::vector<std::string> fsource;
-            for (int i = 0 ; i < numFiles ; i++)
-            {
-                std::string str;
-                if (convertToString(fileName[i], str) >= 0)
-                    fsource.push_back(str);
-            }
-            const char **source = new const char *[numFiles];
-            for (int i = 0 ; i < numFiles ; i++)
-                source[i] = fsource[i].c_str();
-            double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads,
-                                 args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
-            fsource.clear();
-            delete []source;
-            return kernelTime;
-        }
-
-        cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
-                             const size_t size)
-        {
-            int status;
-            cl_mem con_struct;
-
-            con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
-            openCLSafeCall(status);
-
-            openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
-                                                value, 0, 0, 0));
-
-            return con_struct;
-
-        }
-
-        /////////////////////////////OpenCL initialization/////////////////
-        auto_ptr<Context> Context::clCxt;
-        int Context::val = 0;
-        static Mutex cs;
-        static volatile int context_tear_down = 0;
-
-        bool initialized()
-        {
-            return *((volatile int*)&Context::val) != 0 &&
-                Context::clCxt->impl->clCmdQueue != NULL&&
-                Context::clCxt->impl->oclcontext != NULL;
-        }
-
-        Context* Context::getContext()
-        {
-            if(*((volatile int*)&val) != 1)
-            {
-                AutoLock al(cs);
-                if(*((volatile int*)&val) != 1)
-                {
-                    if (context_tear_down)
-                        return clCxt.get();
-                    if( 0 == clCxt.get())
-                        clCxt.reset(new Context);
-                    std::vector<Info> oclinfo;
-                    CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
-
-                    *((volatile int*)&val) = 1;
-                }
-            }
-            return clCxt.get();
-        }
-
-        void Context::setContext(Info &oclinfo)
-        {
-            AutoLock guard(cs);
-            if(*((volatile int*)&val) != 1)
-            {
-                if( 0 == clCxt.get())
-                    clCxt.reset(new Context);
-
-                clCxt.get()->impl = oclinfo.impl->copy();
-
-                *((volatile int*)&val) = 1;
-            }
-            else
-            {
-                clCxt.get()->impl->release();
-                clCxt.get()->impl = oclinfo.impl->copy();
-            }
-        }
-
-        Context::Context()
-        {
-            impl = 0;
-            programCache = ProgramCache::getProgramCache();
-        }
-
-        Context::~Context()
-        {
-            release();
-        }
-
-        void Context::release()
-        {
-            if (impl)
-                impl->release();
-            programCache->releaseProgram();
-        }
-
-        bool Context::supportsFeature(int ftype) const
-        {
-            switch(ftype)
-            {
-            case CL_DOUBLE:
-                return impl->double_support == 1;
-            case CL_UNIFIED_MEM:
-                return impl->unified_memory == 1;
-            case CL_VER_1_2:
-                return impl->clVersion.find("OpenCL 1.2") != string::npos;
-            default:
-                return false;
-            }
-        }
-
-        size_t Context::computeUnits() const
-        {
-            return impl->maxComputeUnits;
-        }
-
-        unsigned long queryLocalMemInfo()
-        {
-            Info::Impl* impl = Context::getContext()->impl;
-            cl_ulong local_memory_size = 0;
-            clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void*)&local_memory_size, 0);
-            return local_memory_size;
-        }
-
-        void* Context::oclContext()
-        {
-            return impl->oclcontext;
-        }
-
-        void* Context::oclCommandQueue()
-        {
-            return impl->clCmdQueue;
-        }
-
-        Info::Info()
-        {
-            impl = new Impl;
-        }
-
-        void Info::release()
-        {
-            fft_teardown();
-            clBlasTeardown();
-            impl->release();
-            impl = new Impl;
-            DeviceName.clear();
-        }
-
-        Info::~Info()
-        {
-            fft_teardown();
-            clBlasTeardown();
-            impl->release();
-        }
-
-        Info &Info::operator = (const Info &m)
-        {
-            impl->release();
-            impl = m.impl->copy();
-            DeviceName = m.DeviceName;
-            return *this;
-        }
-
-        Info::Info(const Info &m)
-        {
-            impl = m.impl->copy();
-            DeviceName = m.DeviceName;
-        }
-    }//namespace ocl
-
-}//namespace cv
diff --git a/modules/ocl/src/knearest.cpp b/modules/ocl/src/knearest.cpp
index fd9f2fed57..02dc72c4ea 100644
--- a/modules/ocl/src/knearest.cpp
+++ b/modules/ocl/src/knearest.cpp
@@ -44,17 +44,11 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 using namespace cv;
 using namespace cv::ocl;
 
-namespace cv
-{
-    namespace ocl
-    {
-        extern const char* knearest;//knearest
-    }
-}
-
 KNearestNeighbour::KNearestNeighbour()
 {
     clear();
@@ -112,7 +106,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable
     k1 = MIN( k1, k );
 
     String kernel_name = "knn_find_nearest";
-    cl_ulong local_memory_size = queryLocalMemInfo();
+    cl_ulong local_memory_size = (cl_ulong)Context::getContext()->getDeviceInfo().localMemorySize;
     int nThreads = local_memory_size / (2 * k * 4);
     if(nThreads >= 256)
         nThreads = 256;
@@ -122,7 +116,7 @@ void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lable
     size_t global_thread[] = {1, samples.rows, 1};
 
     char build_option[50];
-    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
     {
         sprintf(build_option, " ");
     }else
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index 3ae14eb48d..d247a14794 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -134,7 +134,6 @@ void cv::ocl::oclMat::upload(const Mat &m)
     Size wholeSize;
     Point ofs;
     m.locateROI(wholeSize, ofs);
-
     create(wholeSize, m.type());
 
     if (m.channels() == 3)
@@ -142,13 +141,12 @@ void cv::ocl::oclMat::upload(const Mat &m)
         int pitch = wholeSize.width * 3 * m.elemSize1();
         int tail_padding = m.elemSize1() * 3072;
         int err;
-        cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
+        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
                                      (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
         openCLVerifyCall(err);
 
         openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
         convert_C3C4(temp, *this);
-
         openCLSafeCall(clReleaseMemObject(temp));
     }
     else
@@ -197,13 +195,12 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
         int pitch = wholecols * 3 * m.elemSize1();
         int tail_padding = m.elemSize1() * 3072;
         int err;
-        cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
+        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
                                      (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
         openCLVerifyCall(err);
 
         convert_C4C3(*this, temp);
         openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
-
         openCLSafeCall(clReleaseMemObject(temp));
     }
     else
@@ -319,7 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet
 
 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 {
-    if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
+    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) &&
             (depth() == CV_64F || dst.depth() == CV_64F))
     {
         CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
@@ -380,7 +377,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
 #ifdef CL_VERSION_1_2
     // this enables backwards portability to
     // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
+    if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) &&
         dst.offset == 0 && dst.cols == dst.wholecols)
     {
         const int sizeofMap[][7] =
@@ -392,7 +389,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
             };
         int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];
 
-        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
+        clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
                             (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
                             0, dst.step * dst.rows, 0, NULL, NULL);
     }
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index fc94e2f3d8..e4e2e918fb 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -101,15 +101,15 @@ namespace cv
             for(size_t i = 0; i < args.size(); i ++)
                 openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
 
-            openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, globalThreads,
+            openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), kernel, 3, NULL, globalThreads,
                                                   localThreads, 0, NULL, NULL));
 
             switch(finish_mode)
             {
             case CLFINISH:
-                clFinish((cl_command_queue)clCxt->oclCommandQueue());
+                clFinish(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
             case CLFLUSH:
-                clFlush((cl_command_queue)clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
                 break;
             case DISABLE:
             default:
@@ -178,7 +178,7 @@ namespace cv
 #ifdef CL_VERSION_1_2
             //this enables backwards portability to
             //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-            if(Context::getContext()->supportsFeature(Context::CL_VER_1_2))
+            if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2))
             {
                 cl_image_desc desc;
                 desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
@@ -191,13 +191,13 @@ namespace cv
                 desc.buffer           = NULL;
                 desc.num_mip_levels   = 0;
                 desc.num_samples      = 0;
-                texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+                texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
             }
             else
 #endif
             {
                 texture = clCreateImage2D(
-                    (cl_context)mat.clCxt->oclContext(),
+                    *(cl_context*)mat.clCxt->getOpenCLContextPtr(),
                     CL_MEM_READ_WRITE,
                     &format,
                     mat.cols,
@@ -212,22 +212,22 @@ namespace cv
             cl_mem devData;
             if (mat.cols * mat.elemSize() != mat.step)
             {
-                devData = clCreateBuffer((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_ONLY, mat.cols * mat.rows
+                devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_ONLY, mat.cols * mat.rows
                     * mat.elemSize(), NULL, NULL);
                 const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
-                clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
+                clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), (cl_mem)mat.data, devData, origin, origin,
                     regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
-                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
             }
             else
             {
                 devData = (cl_mem)mat.data;
             }
 
-            clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
+            clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), devData, texture, 0, origin, region, 0, NULL, 0);
             if ((mat.cols * mat.elemSize() != mat.step))
             {
-                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
                 clReleaseMemObject(devData);
             }
 
@@ -259,7 +259,7 @@ namespace cv
             try
             {
                 cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
-                finish();
+                cv::ocl::finish();
                 _support = true;
             }
             catch (const cv::Exception& e)
diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp
index 926b94c9b3..24e8b3e0f6 100644
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
 
         bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
 
-        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
+        if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
         {
             CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
         }
@@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
 
         cv::Mat dst(dst_a);
         a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
-        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
         {
             for (int i = 0; i < contour->total; ++i)
             {
diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp
index 5043da05dc..89df73e9a8 100644
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Dachuan Zhao, dachuan@multicorewareinc.com
-//		Yao Wang, yao@multicorewareinc.com
+//        Dachuan Zhao, dachuan@multicorewareinc.com
+//        Yao Wang, yao@multicorewareinc.com
 //
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
index cdcc8f231f..a69015d190 100644
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -125,7 +125,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
     args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
     args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
 
-    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
+    bool is_cpu = isCpuDevice();
     if (is_cpu)
     {
         openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU");
@@ -139,7 +139,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
             stringstream idxStr;
             idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
             cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
-            int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+            int wave_size = (int)queryWaveFrontSize(kernel);
             openCLSafeCall(clReleaseKernel(kernel));
 
             static char opt[32] = {0};
diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp
index 043031072c..01df30c518 100644
--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Zhang Chunpeng chunpeng@multicorewareinc.com
-//		Yao Wang, yao@multicorewareinc.com
+//        Zhang Chunpeng chunpeng@multicorewareinc.com
+//        Yao Wang, yao@multicorewareinc.com
 //
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp
index 79bd0f0e21..fb8d05aaa7 100644
--- a/modules/ocl/src/split_merge.cpp
+++ b/modules/ocl/src/split_merge.cpp
@@ -75,7 +75,7 @@ namespace cv
         {
             static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
             {
-                if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F)
+                if(!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F)
                 {
                     CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
                     return;
@@ -170,7 +170,7 @@ namespace cv
             static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
             {
 
-                if(!mat_src.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_src.type() == CV_64F)
+                if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
                 {
                     CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
                     return;
diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp
index 9052dc82bd..c8334cca42 100644
--- a/modules/ocl/src/stereo_csbp.cpp
+++ b/modules/ocl/src/stereo_csbp.cpp
@@ -150,10 +150,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
                 openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
                 openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
 
@@ -200,9 +200,9 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int),  (void *)&rthis.min_disp_th));
                 openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&cdisp_step1));
                 openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&msg_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
 
@@ -235,10 +235,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
                 openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
                 openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
             static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
@@ -270,10 +270,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
                 openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
                 openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
 
@@ -340,10 +340,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
                 openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&left.step));
                 openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
             static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
@@ -391,10 +391,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
                 openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int),  (void *)&left.step));
                 openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
             static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
@@ -458,10 +458,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
                 openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1));
                 openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
             ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -500,10 +500,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int),  (void *)&disp_step));
                 openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int),  (void *)&msg_step));
                 openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
             static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
@@ -552,10 +552,10 @@ namespace cv
                 openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
                 openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
                 openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
+                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
                     globalThreads, localThreads, 0, NULL, NULL));
 
-                clFinish(*(cl_command_queue*)getoclCommandQueue());
+                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
                 openCLSafeCall(clReleaseKernel(kernel));
             }
         }
diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp
index fe9136057b..5bc93aa3f5 100644
--- a/modules/ocl/src/stereobp.cpp
+++ b/modules/ocl/src/stereobp.cpp
@@ -95,7 +95,10 @@ namespace cv
                 con_struct -> cmax_disc_term    = max_disc_term;
                 con_struct -> cdisc_single_jump = disc_single_jump;
 
-                cl_con_struct = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), (void *)con_struct,
+                Context* clCtx = Context::getContext();
+                cl_context clContext = *(cl_context*)(clCtx->getOpenCLContextPtr());
+                cl_command_queue clCmdQueue = *(cl_command_queue*)(clCtx->getOpenCLCommandQueuePtr());
+                cl_con_struct = load_constant(clContext, clCmdQueue, (void *)con_struct,
                                               sizeof(con_struct_t));
 
                 delete con_struct;
diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp
index 606ac530f7..c9a3f7abc1 100644
--- a/modules/ocl/src/tvl1flow.cpp
+++ b/modules/ocl/src/tvl1flow.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Jin Ma, jin@multicorewareinc.com
+//        Jin Ma, jin@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp
index 594c196a59..8071102bad 100644
--- a/modules/ocl/test/main.cpp
+++ b/modules/ocl/test/main.cpp
@@ -80,18 +80,18 @@ int main(int argc, char **argv)
     const char *keys =
         "{ h | help     | false              | print help message }"
         "{ t | type     | gpu                | set device type:cpu or gpu}"
-        "{ p | platform | 0                  | set platform id }"
+        "{ p | platform | -1                 | set platform id }"
         "{ d | device   | 0                  | set device id }";
 
     CommandLineParser cmd(argc, argv, keys);
     if (cmd.get<bool>("help"))
     {
-        cout << "Avaible options besides goole test option:" << endl;
+        cout << "Available options besides google test option:" << endl;
         cmd.printParams();
         return 0;
     }
     string type = cmd.get<string>("type");
-    unsigned int pid = cmd.get<unsigned int>("platform");
+    int pid = cmd.get<int>("platform");
     int device = cmd.get<int>("device");
 
     print_info();
@@ -100,24 +100,29 @@ int main(int argc, char **argv)
     {
         flag = CVCL_DEVICE_TYPE_CPU;
     }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo, flag);
-    if(devnums <= device || device < 0)
+
+    cv::ocl::PlatformsInfo platformsInfo;
+    cv::ocl::getOpenCLPlatforms(platformsInfo);
+    if (pid >= (int)platformsInfo.size())
     {
-        std::cout << "device invalid\n";
-        return -1;
+        std::cout << "platform is invalid\n";
+        return 1;
     }
-    if(pid >= oclinfo.size())
+
+    cv::ocl::DevicesInfo devicesInfo;
+    int devnums = cv::ocl::getOpenCLDevices(devicesInfo, flag, (pid < 0) ? NULL : platformsInfo[pid]);
+    if (device < 0 || device >= devnums)
     {
-        std::cout << "platform invalid\n";
-        return -1;
+        std::cout << "device/platform invalid\n";
+        return 1;
     }
 
-    setDevice(oclinfo[pid], device);
-
+    cv::ocl::setDevice(devicesInfo[device]);
     setBinaryDiskCache(CACHE_UPDATE);
 
-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
+    cout << "Device type: " << type << endl
+            << "Platform name: " << devicesInfo[device]->platform->platformName << endl
+            << "Device name: " << devicesInfo[device]->deviceName << endl;
     return RUN_ALL_TESTS();
 }
 
diff --git a/modules/superres/perf/perf_superres_ocl.cpp b/modules/superres/perf/perf_superres_ocl.cpp
index 0b9864cbd3..822b87f441 100644
--- a/modules/superres/perf/perf_superres_ocl.cpp
+++ b/modules/superres/perf/perf_superres_ocl.cpp
@@ -107,9 +107,6 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1_OCL,
     Combine(Values(szSmall64, szSmall128),
     Values(MatType(CV_8UC1), MatType(CV_8UC3))))
 {
-    std::vector<cv::ocl::Info>info;
-    cv::ocl::getDevice(info);
-
     declare.time(5 * 60);
 
     const Size size = std::tr1::get<0>(GetParam());
diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp
index 2f27d50259..5aecca0630 100644
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ b/modules/superres/src/btv_l1_ocl.cpp
@@ -232,7 +232,7 @@ void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, in
     cl_mem c_btvRegWeights;
     size_t count = btvWeights_size * sizeof(float);
     c_btvRegWeights = openCLCreateBuffer(clCxt, CL_MEM_READ_ONLY, count);
-    int cl_safe_check = clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL);
+    int cl_safe_check = clEnqueueWriteBuffer(getClCommandQueue(clCxt), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL);
     CV_Assert(cl_safe_check == CL_SUCCESS);
 
     args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data));
diff --git a/modules/superres/test/test_superres.cpp b/modules/superres/test/test_superres.cpp
index 1530d6d667..5cb078f77c 100644
--- a/modules/superres/test/test_superres.cpp
+++ b/modules/superres/test/test_superres.cpp
@@ -278,8 +278,6 @@ TEST_F(SuperResolution, BTVL1_GPU)
 #if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL)
 TEST_F(SuperResolution, BTVL1_OCL)
 {
-    std::vector<cv::ocl::Info> infos;
-    cv::ocl::getDevice(infos);
     RunTest(cv::superres::createSuperResolution_BTVL1_OCL());
 }
 #endif