From 5f56b276160b5a4dfcbcef73e0c418104516d4f4 Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Thu, 31 Mar 2011 10:55:39 +0000 Subject: [PATCH] added ROI support for HOG_GPU --- modules/gpu/src/cuda/hog.cu | 123 ++++++++++++++---------------------- 1 file changed, 48 insertions(+), 75 deletions(-) diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu index 19bd0ed61d..c358ef3c3f 100644 --- a/modules/gpu/src/cuda/hog.cu +++ b/modules/gpu/src/cuda/hog.cu @@ -50,10 +50,6 @@ #endif #endif -#ifndef div_up -#define div_up(n, grain) (((n) + (grain) - 1) / (grain)) -#endif - // Other values are not supported #define CELL_WIDTH 8 #define CELL_HEIGHT 8 @@ -208,7 +204,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y, int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - dim3 grid(div_up(img_block_width, nblocks), img_block_height); + dim3 grid(divUp(img_block_width, nblocks), img_block_height); dim3 threads(32, 2, nblocks); cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks, @@ -311,7 +307,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y, int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - dim3 grid(div_up(img_block_width, nblocks), img_block_height); + dim3 grid(divUp(img_block_width, nblocks), img_block_height); if (nthreads == 32) normalize_hists_kernel_many_blocks<32, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); @@ -395,9 +391,7 @@ __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const } if (threadIdx.x == 0) - labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] - = (product + free_coef >= threshold); - + labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold); } @@ -414,13 +408,11 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block int img_win_height = (height - win_height + win_stride_y) / win_stride_y; dim3 threads(nthreads, 1, nblocks); - dim3 grid(div_up(img_win_width, nblocks), img_win_height); + dim3 grid(divUp(img_win_width, nblocks), img_win_height); - cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks, - cudaFuncCachePreferL1)); + cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks, cudaFuncCachePreferL1)); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / - block_stride_x; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; classify_hists_kernel_many_blocks<<>>( img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, block_hists, coefs, free_coef, threshold, labels); @@ -434,9 +426,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block template -__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, const float* block_hists, - PtrElemStepf descriptors) +__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, + const float* block_hists, PtrElemStepf descriptors) { // Get left top corner of the window in src const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + @@ -455,9 +446,8 @@ __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const i } -void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - DevMem2Df descriptors) +void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, float* block_hists, DevMem2Df descriptors) { const int nthreads = 256; @@ -468,8 +458,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i dim3 threads(nthreads, 1); dim3 grid(img_win_width, img_win_height); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / - block_stride_x; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; extract_descrs_by_rows_kernel<<>>( img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); cudaSafeCall( cudaGetLastError() ); @@ -640,21 +629,17 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im const int nthreads = 256; dim3 bdim(nthreads, 1); - dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y)); + dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y)); if (correct_gamma) - compute_gradients_8UC4_kernel<<>>( - height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); else - compute_gradients_8UC4_kernel<<>>( - height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); } - template __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, float angle_scale, PtrElemStepf grad, PtrElemStep qangle) @@ -715,17 +700,14 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im const int nthreads = 256; dim3 bdim(nthreads, 1); - dim3 gdim(div_up(width, bdim.x), div_up(height, bdim.y)); + dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y)); if (correct_gamma) - compute_gradients_8UC1_kernel<<>>( - height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); else - compute_gradients_8UC1_kernel<<>>( - height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); } @@ -735,67 +717,58 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im // Resize texture resize8UC4_tex; -texture resize8UC1_tex; - +texture resize8UC1_tex; -extern "C" __global__ void resize_8UC4_kernel(float sx, float sy, DevMem2D dst) +__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_ dst, int colOfs) { unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < dst.cols && y < dst.rows) - { - float4 val = tex2D(resize8UC4_tex, x * sx, y * sy); - ((uchar4*)dst.ptr(y))[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255); - } + ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255; } - -void resize_8UC4(const DevMem2D& src, DevMem2D dst) -{ - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaBindTexture2D(0, resize8UC4_tex, src.data, desc, src.cols, src.rows, src.step); - resize8UC4_tex.filterMode = cudaFilterModeLinear; - - dim3 threads(32, 8); - dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y)); - float sx = (float)src.cols / dst.cols; - float sy = (float)src.rows / dst.rows; - resize_8UC4_kernel<<>>(sx, sy, dst); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); - - cudaSafeCall(cudaUnbindTexture(resize8UC4_tex)); -} - - -extern "C" __global__ void resize_8UC1_kernel(float sx, float sy, DevMem2D dst) +__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_ dst, int colOfs) { unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < dst.cols && y < dst.rows) - ((unsigned char*)dst.ptr(y))[x] = tex2D(resize8UC1_tex, x * sx, y * sy) * 255; + { + float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy); + dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255); + } } - -void resize_8UC1(const DevMem2D& src, DevMem2D dst) +template +static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex) { - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaBindTexture2D(0, resize8UC1_tex, src.data, desc, src.cols, src.rows, src.step); - resize8UC1_tex.filterMode = cudaFilterModeLinear; + tex.filterMode = cudaFilterModeLinear; + + size_t texOfs = 0; + int colOfs = 0; + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) ); + + if (texOfs != 0) + { + colOfs = static_cast( texOfs/sizeof(T) ); + cudaSafeCall( cudaUnbindTexture(tex) ); + cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) ); + } dim3 threads(32, 8); - dim3 grid(div_up(dst.cols, threads.x), div_up(dst.rows, threads.y)); - float sx = (float)src.cols / dst.cols; - float sy = (float)src.rows / dst.rows; - resize_8UC1_kernel<<>>(sx, sy, dst); + dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); + float sx = static_cast(src.cols) / dst.cols; + float sy = static_cast(src.rows) / dst.rows; + resize_for_hog_kernel<<>>(sx, sy, (DevMem2D_)dst, colOfs); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - cudaSafeCall(cudaUnbindTexture(resize8UC1_tex)); } +void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog (src, dst, resize8UC1_tex); } +void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog(src, dst, resize8UC4_tex); } + }}} \ No newline at end of file