From 40d0e0eda0ddb04099c58127c9d12ace78da8f6e Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Sat, 13 Apr 2013 14:58:49 +0800
Subject: [PATCH] use host data when DEVICE_MEM_UHP is set (the risk of vary
 align size is owned by users)

---
 modules/ocl/include/opencv2/ocl.hpp           |  6 ++-
 .../ocl/include/opencv2/ocl/private/util.hpp  |  3 +-
 modules/ocl/src/initialization.cpp            | 19 ++++---
 modules/ocl/src/matrix_operations.cpp         | 50 +++++--------------
 4 files changed, 31 insertions(+), 47 deletions(-)

diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
index cd1429a63f..7cc55b8335 100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -263,8 +263,10 @@ namespace cv
             void create(Size size, int type);
 
             //! allocates new oclMatrix with specified device memory type.
-            void createEx(int rows, int cols, int type, DevMemRW rw_type, DevMemType mem_type);
-            void createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type);
+            void createEx(int rows, int cols, int type, 
+                          DevMemRW rw_type, DevMemType mem_type, void* hptr = 0);
+            void createEx(Size size, int type, DevMemRW rw_type, 
+                          DevMemType mem_type, void* hptr = 0);
 
             //! decreases reference counter;
             // deallocate the data when reference counter reaches 0.
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 9ac032a29a..786f2d6192 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -68,7 +68,8 @@ namespace cv
         void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                                           size_t widthInBytes, size_t height);
         void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-                                            size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
+                                            size_t widthInBytes, size_t height, 
+                                            DevMemRW rw_type, DevMemType mem_type, void* hptr = 0);
         void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
                                        const void *src, size_t spitch,
                                        size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 37a69cece5..9394b7e9d0 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -163,7 +163,7 @@ namespace cv
                 {
                     releaseResources();
                     delete this;
-            }
+                }
             }
 
             Impl* copy()
@@ -260,9 +260,8 @@ namespace cv
 
         int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
         {
-            if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
-                 mem_type == DEVICE_MEM_UHP ||
-                 mem_type == DEVICE_MEM_CHP )
+            if( (mem_type == DEVICE_MEM_PM && 
+                 Context::getContext()->impl->unified_memory == 0) )
                 return -1;
             gDeviceMemRW = rw_type;
             gDeviceMemType = mem_type;
@@ -432,11 +431,17 @@ namespace cv
         }
 
         void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
+                                 size_t widthInBytes, size_t height, 
+                                 DevMemRW rw_type, DevMemType mem_type, void* hptr)
         {
             cl_int status;
-            *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
-                                      widthInBytes * height, 0, &status);
+            if(hptr && (mem_type==DEVICE_MEM_UHP || mem_type==DEVICE_MEM_CHP))
+                *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, 
+                                          gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], 
+                                          widthInBytes * height, hptr, &status);
+            else
+                *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
+                                          widthInBytes * height, 0, &status);
             openCLVerifyCall(status);
             *pitch = widthInBytes;
         }
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index 7cd596f333..4d697a2d50 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -177,15 +177,9 @@ void cv::ocl::oclMat::upload(const Mat &m)
     Size wholeSize;
     Point ofs;
     m.locateROI(wholeSize, ofs);
-    //   int type = m.type();
-    //   if(m.oclchannels() == 3)
-    //{
-    //	type = CV_MAKETYPE(m.depth(), 4);
-    //}
-    create(wholeSize, m.type());
-
     if(m.channels() == 3)
     {
+        create(wholeSize, m.type());
         int pitch = wholeSize.width * 3 * m.elemSize1();
         int tail_padding = m.elemSize1() * 3072;
         int err;
@@ -195,35 +189,20 @@ void cv::ocl::oclMat::upload(const Mat &m)
 
         openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
         convert_C3C4(temp, *this);
-        //int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
-        //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
-        //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
-        //						0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
-        //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
-        //						0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
-        //for(int i=0;i<wholeSize.height;i++)
-        //{
-        //	int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
-        //	for(int j=0;j<wholeSize.width;j++)
-        //	{
-        //		if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
-        //			printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
-        //			i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
-        //	}
-        //}
-        //delete []cputemp;
-        //delete []cpudata;
         openCLSafeCall(clReleaseMemObject(temp));
     }
     else
     {
-        openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
+        // try to use host ptr
+        createEx(wholeSize, m.type(), gDeviceMemRW, gDeviceMemType, m.datastart);
+        if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP)
+            openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, 
+                           wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
     }
 
     rows = m.rows;
     cols = m.cols;
     offset = ofs.y * step + ofs.x * elemSize();
-    //download_channels = m.channels();
 }
 
 void cv::ocl::oclMat::download(cv::Mat &m) const
@@ -908,9 +887,10 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
 
 }
 
-void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
+void cv::ocl::oclMat::createEx(Size size, int type, 
+                               DevMemRW rw_type, DevMemType mem_type, void* hptr)
 {
-    createEx(size.height, size.width, type, rw_type, mem_type);
+    createEx(size.height, size.width, type, rw_type, mem_type, hptr);
 }
 
 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
@@ -918,16 +898,12 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
     createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
 }
 
-void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
+void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, 
+                               DevMemRW rw_type, DevMemType mem_type, void* hptr)
 {
     clCxt = Context::getContext();
     /* core logic */
     _type &= Mat::TYPE_MASK;
-    //download_channels = CV_MAT_CN(_type);
-    //if(download_channels==3)
-    //{
-    //	_type = CV_MAKE_TYPE((CV_MAT_DEPTH(_type)),4);
-    //}
     if( rows == _rows && cols == _cols && type() == _type && data )
         return;
     if( data )
@@ -943,8 +919,8 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type
         size_t esz = elemSize();
 
         void *dev_ptr;
-        openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
-        //openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);
+        openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), 
+                            rows, rw_type, mem_type, hptr);
 
         if (esz * cols == step)
             flags |= Mat::CONTINUOUS_FLAG;