From 5f56b276160b5a4dfcbcef73e0c418104516d4f4 Mon Sep 17 00:00:00 2001
From: Anatoly Baksheev <no@email>
Date: Thu, 31 Mar 2011 10:55:39 +0000
Subject: [PATCH] added ROI support for HOG_GPU

---
 modules/gpu/src/cuda/hog.cu | 123 ++++++++++++++----------------------
 1 file changed, 48 insertions(+), 75 deletions(-)
diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu
index 19bd0ed61d..c358ef3c3f 100644
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -50,10 +50,6 @@
   #endif
 #endif
 
-#ifndef div_up
-#define div_up(n, grain) (((n) + (grain) - 1) / (grain))
-#endif
-
 // Other values are not supported
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
@@ -208,7 +204,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
     int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / 
                            block_stride_y;
 
-    dim3 grid(div_up(img_block_width, nblocks), img_block_height);
+    dim3 grid(divUp(img_block_width, nblocks), img_block_height);
     dim3 threads(32, 2, nblocks);
 
     cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>, 
@@ -311,7 +307,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
 
     int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-    dim3 grid(div_up(img_block_width, nblocks), img_block_height);
+    dim3 grid(divUp(img_block_width, nblocks), img_block_height);
 
     if (nthreads == 32)
         normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
@@ -395,9 +391,7 @@ __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const
     }
 
     if (threadIdx.x == 0)
-        labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] 
-            = (product + free_coef >= threshold);
-
+        labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
 }
 
 
@@ -414,13 +408,11 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
 
     dim3 threads(nthreads, 1, nblocks);
-    dim3 grid(div_up(img_win_width, nblocks), img_win_height);
+    dim3 grid(divUp(img_win_width, nblocks), img_win_height);
 
-    cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, 
-                                        cudaFuncCachePreferL1));
+    cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
-                          block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
         img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, 
         block_hists, coefs, free_coef, threshold, labels);
@@ -434,9 +426,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
 
 
 template <int nthreads>
-__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, 
-                                              const int win_block_stride_y, const float* block_hists, 
-                                              PtrElemStepf descriptors)
+__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, 
+											  const float* block_hists, PtrElemStepf descriptors)
 {
     // Get left top corner of the window in src
     const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
@@ -455,9 +446,8 @@ __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const i
 }
 
 
-void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, 
-                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
-                            DevMem2Df descriptors)
+void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, 
+							int height, int width, float* block_hists, DevMem2Df descriptors)
 {
     const int nthreads = 256;
 
@@ -468,8 +458,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
     dim3 threads(nthreads, 1);
     dim3 grid(img_win_width, img_win_height);
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
-                          block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
@@ -640,21 +629,17 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
     const int nthreads = 256;
 
     dim3 bdim(nthreads, 1);
-    dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y));
+    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
     if (correct_gamma)
-        compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(
-                height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     else
-        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
-                height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
 
     cudaSafeCall( cudaGetLastError() );
-
     cudaSafeCall(cudaThreadSynchronize());
 }
 
-
 template <int nthreads, int correct_gamma>
 __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, 
                                               float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
@@ -715,17 +700,14 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
     const int nthreads = 256;
 
     dim3 bdim(nthreads, 1);
-    dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y));
+    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
     if (correct_gamma)
-        compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(
-                height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     else
-        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
-                height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
 
     cudaSafeCall( cudaGetLastError() );
-
     cudaSafeCall(cudaThreadSynchronize());
 }
 
@@ -735,67 +717,58 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
 // Resize
 
 texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
-texture<unsigned char, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
-
+texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;
 
-extern "C" __global__ void resize_8UC4_kernel(float sx, float sy, DevMem2D dst)
+__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)
 {
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (x < dst.cols && y < dst.rows)
-    {
-        float4 val = tex2D(resize8UC4_tex, x * sx, y * sy);
-        ((uchar4*)dst.ptr(y))[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
-    }
+        ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
 }
 
-
-void resize_8UC4(const DevMem2D& src, DevMem2D dst)
-{
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-    cudaBindTexture2D(0, resize8UC4_tex, src.data, desc, src.cols, src.rows, src.step);
-    resize8UC4_tex.filterMode = cudaFilterModeLinear;
-
-    dim3 threads(32, 8);
-    dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y));
-    float sx = (float)src.cols / dst.cols;
-    float sy = (float)src.rows / dst.rows;
-    resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-
-    cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
-}
-
-
-extern "C" __global__ void resize_8UC1_kernel(float sx, float sy, DevMem2D dst)
+__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)
 {
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (x < dst.cols && y < dst.rows)
-        ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx, y * sy) * 255;
+	{        
+		float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
+        dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
+	}
 }
 
-
-void resize_8UC1(const DevMem2D& src, DevMem2D dst)
+template<class T, class TEX> 
+static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
 {
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-    cudaBindTexture2D(0, resize8UC1_tex, src.data, desc, src.cols, src.rows, src.step);
-    resize8UC1_tex.filterMode = cudaFilterModeLinear;
+    tex.filterMode = cudaFilterModeLinear;
+
+    size_t texOfs = 0;
+    int colOfs = 0;
+
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();    
+    cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
+
+    if (texOfs != 0) 
+    {
+        colOfs = static_cast<int>( texOfs/sizeof(T) );
+        cudaSafeCall( cudaUnbindTexture(tex) );
+        cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
+    }    
 
     dim3 threads(32, 8);
-    dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y));
-    float sx = (float)src.cols / dst.cols;
-    float sy = (float)src.rows / dst.rows;
-    resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
+    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+    float sx = static_cast<float>(src.cols) / dst.cols;
+    float sy = static_cast<float>(src.rows) / dst.rows;
+    resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
     cudaSafeCall( cudaGetLastError() );
-
     cudaSafeCall(cudaThreadSynchronize());
-
     cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));
 }
 
+void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
+void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
+
 }}}
\ No newline at end of file