@@ -50,10 +50,6 @@
 #endif
 #endif
 
-#ifndef div_up
-#define div_up(n, grain) (((n) + (grain) - 1) / (grain))
-#endif
-
 // Other values are not supported
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
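
Reviewer note on the hunk above: the file-local div_up macro is dropped, and the later hunks call divUp instead. The patch itself does not show where divUp is defined; presumably it comes from the module's shared headers. For reference, both spellings are the usual integer ceiling division, sketched below (assumption: a helper equivalent to this is already visible to hog.cu).

```cpp
// Integer ceiling division: number of grain-sized chunks needed to cover total.
// Behaves exactly like the removed div_up macro, but as a typed inline function.
static inline int divUp(int total, int grain)
{
    return (total + grain - 1) / grain;
}
// Example: divUp(100, 32) == 4 -- a 100-element row needs four 32-thread blocks.
```
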
@@ -208,7 +204,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
     int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
                            block_stride_y;
 
-    dim3 grid(div_up(img_block_width, nblocks), img_block_height);
+    dim3 grid(divUp(img_block_width, nblocks), img_block_height);
     dim3 threads(32, 2, nblocks);
 
     cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
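
The launch geometry in compute_hists is easiest to sanity-check with concrete numbers. The sketch below is illustrative only (the image size, block stride, and nblocks value are assumptions, not values from this patch); it reproduces the img_block_width / img_block_height arithmetic and the divUp-based grid width.

```cpp
#include <cstdio>

// Same ceiling division as the divUp used by the patch.
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

int main()
{
    // Assumed example values: CELL_* and CELLS_PER_BLOCK_* as in the defines above,
    // a 64x128 image, an 8-pixel block stride, and 4 HOG blocks per CUDA block.
    const int cell = 8, cells_per_block = 2;
    const int width = 64, height = 128;
    const int block_stride_x = 8, block_stride_y = 8;
    const int nblocks = 4;

    int img_block_width  = (width  - cells_per_block * cell + block_stride_x) / block_stride_x;  // (64 - 16 + 8) / 8 = 7
    int img_block_height = (height - cells_per_block * cell + block_stride_y) / block_stride_y;  // (128 - 16 + 8) / 8 = 15

    // Mirrors dim3 grid(divUp(img_block_width, nblocks), img_block_height) -> (2, 15).
    printf("grid = (%d, %d)\n", divUp(img_block_width, nblocks), img_block_height);
    return 0;
}
```
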
@@ -311,7 +307,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
 
     int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-    dim3 grid(div_up(img_block_width, nblocks), img_block_height);
+    dim3 grid(divUp(img_block_width, nblocks), img_block_height);
 
     if (nthreads == 32)
         normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
@@ -395,9 +391,7 @@ __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const
 
     }
 
     if (threadIdx.x == 0)
-        labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
-            = (product + free_coef >= threshold);
-
+        labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
 }
 
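
For context, the condition written by thread 0 above is a linear SVM decision per detection window: the dot product of the window's block histograms with the classifier coefficients (reduced across the thread block into product), plus the free coefficient, compared against the hit threshold. A scalar sketch of the same decision, outside CUDA and with illustrative names:

```cpp
#include <cstddef>

// label = (w . x + free_coef >= threshold), computed serially here;
// the kernel obtains the same sum with a per-window parallel reduction.
bool classify_window(const float* descriptor, const float* coefs, std::size_t len,
                     float free_coef, float threshold)
{
    float product = 0.f;
    for (std::size_t i = 0; i < len; ++i)
        product += descriptor[i] * coefs[i];
    return product + free_coef >= threshold;
}
```
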
@@ -414,13 +408,11 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
 
     dim3 threads(nthreads, 1, nblocks);
-    dim3 grid(div_up(img_win_width, nblocks), img_win_height);
+    dim3 grid(divUp(img_win_width, nblocks), img_win_height);
 
-    cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>,
-                                        cudaFuncCachePreferL1));
+    cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-                          block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
         img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
         block_hists, coefs, free_coef, threshold, labels);
@@ -434,9 +426,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
 
 
 template <int nthreads>
-__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x,
-                                              const int win_block_stride_y, const float* block_hists,
-                                              PtrElemStepf descriptors)
+__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
+                                              const float* block_hists, PtrElemStepf descriptors)
 {
     // Get left top corner of the window in src
     const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@@ -455,9 +446,8 @@ __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const i
 }
 
 
-void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                            DevMem2Df descriptors)
+void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
+                            int height, int width, float* block_hists, DevMem2Df descriptors)
 {
     const int nthreads = 256;
 
@@ -468,8 +458,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
     dim3 threads(nthreads, 1);
     dim3 grid(img_win_width, img_win_height);
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-                          block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
     extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
@@ -640,21 +629,17 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
     const int nthreads = 256;
 
     dim3 bdim(nthreads, 1);
-    dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y));
+    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
     if (correct_gamma)
-        compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(
-            height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     else
-        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
-            height, width, img, angle_scale, grad, qangle);
-
+        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
 
     cudaSafeCall(cudaThreadSynchronize());
 }
 
-
 template <int nthreads, int correct_gamma>
 __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img,
                                               float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
@@ -715,17 +700,14 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
     const int nthreads = 256;
 
     dim3 bdim(nthreads, 1);
-    dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y));
+    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
     if (correct_gamma)
-        compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(
-            height, width, img, angle_scale, grad, qangle);
+        compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     else
-        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
-            height, width, img, angle_scale, grad, qangle);
-
+        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
 
     cudaSafeCall(cudaThreadSynchronize());
 }
 
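
Both gradient launchers (8UC4 above and 8UC1 here) follow the same pattern: correct_gamma is a template int, so each kernel specialization compiles with the gamma branch resolved at compile time, and the host-side if only selects <nthreads, 1> or <nthreads, 0>. A minimal standalone sketch of that pattern (illustrative names, not code from this patch):

```cpp
#include <cmath>
#include <cstdio>

// The flag is a template parameter, so the branch vanishes in each specialization.
template <int correct_gamma>
static float magnitude(float v)
{
    if (correct_gamma)
        return sqrtf(v);   // gamma-corrected path
    return v;              // plain path
}

// The host mirrors the launchers: a runtime bool picks the specialization once.
static float compute(float v, bool correct_gamma)
{
    return correct_gamma ? magnitude<1>(v) : magnitude<0>(v);
}

int main()
{
    printf("%f %f\n", compute(0.25f, true), compute(0.25f, false));
    return 0;
}
```
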
@@ -735,67 +717,58 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
 // Resize
 
 texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
-texture<unsigned char, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
+texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
 
-extern "C" __global__ void resize_8UC4_kernel(float sx, float sy, DevMem2D dst)
+__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)
 {
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (x < dst.cols && y < dst.rows)
-    {
-        float4 val = tex2D(resize8UC4_tex, x * sx, y * sy);
-        ((uchar4*)dst.ptr(y))[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
-    }
+        ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
 }
 
-void resize_8UC4(const DevMem2D& src, DevMem2D dst)
-{
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-    cudaBindTexture2D(0, resize8UC4_tex, src.data, desc, src.cols, src.rows, src.step);
-    resize8UC4_tex.filterMode = cudaFilterModeLinear;
-
-    dim3 threads(32, 8);
-    dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y));
-    float sx = (float)src.cols / dst.cols;
-    float sy = (float)src.rows / dst.rows;
-    resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-
-    cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
-}
-
-extern "C" __global__ void resize_8UC1_kernel(float sx, float sy, DevMem2D dst)
+__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)
 {
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (x < dst.cols && y < dst.rows)
-        ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx, y * sy) * 255;
+    {
+        float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
+        dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
+    }
 }
 
-void resize_8UC1(const DevMem2D& src, DevMem2D dst)
+template<class T, class TEX>
+static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
 {
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-    cudaBindTexture2D(0, resize8UC1_tex, src.data, desc, src.cols, src.rows, src.step);
-    resize8UC1_tex.filterMode = cudaFilterModeLinear;
+    tex.filterMode = cudaFilterModeLinear;
+
+    size_t texOfs = 0;
+    int colOfs = 0;
+
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+    cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
+
+    if (texOfs != 0)
+    {
+        colOfs = static_cast<int>( texOfs/sizeof(T) );
+        cudaSafeCall( cudaUnbindTexture(tex) );
+        cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
+    }
 
     dim3 threads(32, 8);
-    dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y));
-    float sx = (float)src.cols / dst.cols;
-    float sy = (float)src.rows / dst.rows;
-    resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
+    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+    float sx = static_cast<float>(src.cols) / dst.cols;
+    float sy = static_cast<float>(src.rows) / dst.rows;
+    resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
    cudaSafeCall( cudaGetLastError() );
 
     cudaSafeCall(cudaThreadSynchronize());
-
-    cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));
 }
 
+void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
+void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
+
 }}}
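
A closing note on resize_for_hog: cudaBindTexture2D can succeed while reporting a non-zero byte offset when src.data does not meet the device's texture alignment, and texture fetches then have to be shifted by that offset. That is what colOfs is for: the byte offset is converted to texels and added to the x coordinate inside both resize_for_hog_kernel overloads, while the two thin resize_8UC1 / resize_8UC4 wrappers just instantiate the template with the matching texture reference. A trimmed sketch of the idiom with explanatory comments (same names as the patch; T, tex, and src are assumed from the surrounding template):

```cpp
// Sketch only -- this fragment assumes the template context of resize_for_hog:
// T is the texel type, tex the matching 2D texture reference, src the source image.
size_t texOfs = 0;   // byte offset reported by the binding call
int    colOfs = 0;   // the same offset expressed in texels

cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );

// A non-zero texOfs means the texture base was aligned down from src.data, so every
// tex2D() lookup must be shifted right by texOfs / sizeof(T) columns; the kernels
// apply it as tex2D(tex, x * sx + colOfs, y * sy).
if (texOfs != 0)
    colOfs = static_cast<int>(texOfs / sizeof(T));
```
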