From 89889ae8ea2cc5590ca58f5f029ad8e732dfbf64 Mon Sep 17 00:00:00 2001 From: ManuelFreudenreich Date: Thu, 11 Jun 2015 13:29:23 +0200 Subject: [PATCH] changed hog to work with variable parameters and changed the hog sample to test it with more options added comments and tests --- modules/cudaobjdetect/src/cuda/hog.cu | 205 +++++++++++------- modules/cudaobjdetect/src/hog.cpp | 61 ++++-- modules/cudaobjdetect/test/test_objdetect.cpp | 200 ++++++++++++++++- samples/gpu/hog.cpp | 114 ++++++++-- 4 files changed, 467 insertions(+), 113 deletions(-) diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index 1a9c1e6bc2..e5c7e8e9ed 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -49,11 +49,6 @@ namespace cv { namespace cuda { namespace device { - // Other values are not supported - #define CELL_WIDTH 8 - #define CELL_HEIGHT 8 - #define CELLS_PER_BLOCK_X 2 - #define CELLS_PER_BLOCK_Y 2 namespace hog { @@ -62,6 +57,8 @@ namespace cv { namespace cuda { namespace device __constant__ int cblock_stride_y; __constant__ int cnblocks_win_x; __constant__ int cnblocks_win_y; + __constant__ int cncells_block_x; + __constant__ int cncells_block_y; __constant__ int cblock_hist_size; __constant__ int cblock_hist_size_2up; __constant__ int cdescr_size; @@ -72,31 +69,47 @@ namespace cv { namespace cuda { namespace device the typical GPU thread count (pert block) values */ int power_2up(unsigned int n) { - if (n < 1) return 1; - else if (n < 2) return 2; - else if (n < 4) return 4; - else if (n < 8) return 8; - else if (n < 16) return 16; - else if (n < 32) return 32; - else if (n < 64) return 64; - else if (n < 128) return 128; - else if (n < 256) return 256; - else if (n < 512) return 512; - else if (n < 1024) return 1024; + if (n <= 1) return 1; + else if (n <= 2) return 2; + else if (n <= 4) return 4; + else if (n <= 8) return 8; + else if (n <= 16) return 16; + else if (n <= 32) return 32; + else if (n <= 64) return 64; + else if (n <= 128) return 128; + else if (n <= 256) return 256; + else if (n <= 512) return 512; + else if (n <= 1024) return 1024; return -1; // Input is too big } + /* Returns the max size for nblocks */ + int max_nblocks(int nthreads, int ncells_block = 1) + { + int threads = nthreads * ncells_block; + if(threads * 4 <= 256) + return 4; + else if(threads * 3 <= 256) + return 3; + else if(threads * 2 <= 256) + return 2; + else + return 1; + } + void set_up_constants(int nbins, int block_stride_x, int block_stride_y, - int nblocks_win_x, int nblocks_win_y) + int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y) { cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) ); cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); + cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) ); + cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) ); - int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; + int block_hist_size = nbins * ncells_block_x * ncells_block_y; cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); int block_hist_size_2up = power_2up(block_hist_size); @@ -112,44 +125,48 @@ namespace cv { namespace cuda { namespace device //---------------------------------------------------------------------------- // Histogram computation - - + // + // CUDA kernel to compute the histograms template // Number of histogram blocks processed by single GPU thread block __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad, - const PtrStepb qangle, float scale, float* block_hists) + const PtrStepb qangle, float scale, float* block_hists, + int cell_size, int patch_size, int block_patch_size, + int threads_cell, int threads_block, int half_cell_size) { const int block_x = threadIdx.z; - const int cell_x = threadIdx.x / 16; + const int cell_x = threadIdx.x / threads_cell; const int cell_y = threadIdx.y; - const int cell_thread_x = threadIdx.x & 0xF; + const int cell_thread_x = threadIdx.x & (threads_cell - 1); if (blockIdx.x * blockDim.z + block_x >= img_block_width) return; extern __shared__ float smem[]; float* hists = smem; - float* final_hist = smem + cnbins * 48 * nblocks; + float* final_hist = smem + cnbins * block_patch_size * nblocks; - const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + - 4 * cell_x + cell_thread_x; - const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y; + // patch_size means that patch_size pixels affect on block's cell + if (cell_thread_x < patch_size) + { + const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + + half_cell_size * cell_x + cell_thread_x; + const int offset_y = blockIdx.y * cblock_stride_y + half_cell_size * cell_y; - const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2; - const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2; + const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2; + const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2; - // 12 means that 12 pixels affect on block's cell (in one row) - if (cell_thread_x < 12) - { - float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y + - cell_x + block_x * CELLS_PER_BLOCK_X) + + + float* hist = hists + patch_size * (cell_y * blockDim.z * cncells_block_y + + cell_x + block_x * cncells_block_x) + cell_thread_x; for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48 * nblocks] = 0.f; + hist[bin_id * block_patch_size * nblocks] = 0.f; - const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x; + //(dist_x, dist_y) : distance between current pixel in patch and cell's center + const int dist_x = -half_cell_size + (int)cell_thread_x - half_cell_size * cell_x; - const int dist_y_begin = -4 - 4 * (int)threadIdx.y; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) + const int dist_y_begin = -half_cell_size - half_cell_size * (int)threadIdx.y; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + patch_size; ++dist_y) { float2 vote = *(const float2*)grad_ptr; uchar2 bin = *(const uchar2*)qangle_ptr; @@ -157,25 +174,29 @@ namespace cv { namespace cuda { namespace device grad_ptr += grad.step/sizeof(float); qangle_ptr += qangle.step; - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + //(dist_center_x, dist_center_y) : distance between current pixel in patch and block's center + int dist_center_y = dist_y - half_cell_size * (1 - 2 * cell_y); + int dist_center_x = dist_x - half_cell_size * (1 - 2 * cell_x); float gaussian = ::expf(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale); - float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) * - (8.f - ::fabs(dist_x + 0.5f)) / 64.f; - hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x; - hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y; + float interp_weight = ((float)cell_size - ::fabs(dist_y + 0.5f)) * + ((float)cell_size - ::fabs(dist_x + 0.5f)) / (float)threads_block; + + hist[bin.x * block_patch_size * nblocks] += gaussian * interp_weight * vote.x; + hist[bin.y * block_patch_size * nblocks] += gaussian * interp_weight * vote.y; } + //reduction of the histograms volatile float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks) + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += block_patch_size * nblocks) { - if (cell_thread_x < 6) hist_[0] += hist_[6]; - if (cell_thread_x < 3) hist_[0] += hist_[3]; + if (cell_thread_x < patch_size/2) hist_[0] += hist_[patch_size/2]; + if (cell_thread_x < patch_size/4 && (!((patch_size/4) < 3 && cell_thread_x == 0))) + hist_[0] += hist_[patch_size/4]; if (cell_thread_x == 0) - final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id] + final_hist[((cell_x + block_x * cncells_block_x) * cncells_block_y + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; } } @@ -186,37 +207,69 @@ namespace cv { namespace cuda { namespace device blockIdx.x * blockDim.z + block_x) * cblock_hist_size; - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; + //copying from final_hist to block_hist + int tid; + if(threads_cell < cnbins) + { + tid = (cell_y * cncells_block_y + cell_x) * cnbins + cell_thread_x; + } else + { + tid = (cell_y * cncells_block_y + cell_x) * threads_cell + cell_thread_x; + } if (tid < cblock_hist_size) + { block_hist[tid] = final_hist[block_x * cblock_hist_size + tid]; + if(threads_cell < cnbins && cell_thread_x == (threads_cell-1)) + { + for(int i=1;i<=(cnbins - threads_cell);++i) + { + block_hist[tid + i] = final_hist[block_x * cblock_hist_size + tid + i]; + } + } + } } - + //declaration of variables and invoke the kernel with the calculated number of blocks void compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, const PtrStepSzf& grad, - const PtrStepSzb& qangle, float sigma, float* block_hists) + const PtrStepSzb& qangle, float sigma, float* block_hists, + int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y) { - const int nblocks = 1; - - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + const int ncells_block = ncells_block_x * ncells_block_y; + const int patch_side = cell_size_x / 4; + const int patch_size = cell_size_x + (patch_side * 2); + const int block_patch_size = ncells_block * patch_size; + const int threads_cell = power_2up(patch_size); + const int threads_block = ncells_block * threads_cell; + const int half_cell_size = cell_size_x / 2; + + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; - int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / + int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) / block_stride_y; + const int nblocks = max_nblocks(threads_cell, ncells_block); dim3 grid(divUp(img_block_width, nblocks), img_block_height); - dim3 threads(32, 2, nblocks); - - cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks, - cudaFuncCachePreferL1)); + dim3 threads(threads_cell * ncells_block_x, ncells_block_y, nblocks); // Precompute gaussian spatial window parameter float scale = 1.f / (2.f * sigma * sigma); - int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float); - int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float); + int hists_size = (nbins * ncells_block * patch_size * nblocks) * sizeof(float); + int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float); int smem = hists_size + final_hists_size; - compute_hists_kernel_many_blocks<<>>( - img_block_width, grad, qangle, scale, block_hists); + if (nblocks == 4) + compute_hists_kernel_many_blocks<4><<>>( + img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + else if (nblocks == 3) + compute_hists_kernel_many_blocks<3><<>>( + img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + else if (nblocks == 2) + compute_hists_kernel_many_blocks<2><<>>( + img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + else + compute_hists_kernel_many_blocks<1><<>>( + img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -293,16 +346,16 @@ namespace cv { namespace cuda { namespace device void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold) + int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y) { const int nblocks = 1; - int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; + int block_hist_size = nbins * ncells_block_x * ncells_block_y; int nthreads = power_2up(block_hist_size); dim3 threads(nthreads, 1, nblocks); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; - int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; + int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) / block_stride_y; dim3 grid(divUp(img_block_width, nblocks), img_block_height); if (nthreads == 32) @@ -310,7 +363,7 @@ namespace cv { namespace cuda { namespace device else if (nthreads == 64) normalize_hists_kernel_many_blocks<64, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 128) - normalize_hists_kernel_many_blocks<64, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<128, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 256) normalize_hists_kernel_many_blocks<256, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 512) @@ -365,7 +418,7 @@ namespace cv { namespace cuda { namespace device void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - float* coefs, float free_coef, float threshold, float *confidences) + float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences) { const int nthreads = 256; const int nblocks = 1; @@ -381,7 +434,7 @@ namespace cv { namespace cuda { namespace device cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks, cudaFuncCachePreferL1)); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; compute_confidence_hists_kernel_many_blocks<<>>( img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, @@ -427,7 +480,7 @@ namespace cv { namespace cuda { namespace device void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - float* coefs, float free_coef, float threshold, unsigned char* labels) + float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels) { const int nthreads = 256; const int nblocks = 1; @@ -442,7 +495,7 @@ namespace cv { namespace cuda { namespace device cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks, cudaFuncCachePreferL1)); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; classify_hists_kernel_many_blocks<<>>( img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, block_hists, coefs, free_coef, threshold, labels); @@ -477,7 +530,7 @@ namespace cv { namespace cuda { namespace device void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, - int height, int width, float* block_hists, PtrStepSzf descriptors) + int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors) { const int nthreads = 256; @@ -488,7 +541,7 @@ namespace cv { namespace cuda { namespace device dim3 threads(nthreads, 1); dim3 grid(img_win_width, img_win_height); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; extract_descrs_by_rows_kernel<<>>( img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); cudaSafeCall( cudaGetLastError() ); @@ -525,7 +578,7 @@ namespace cv { namespace cuda { namespace device void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, + int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors) { const int nthreads = 256; @@ -537,7 +590,7 @@ namespace cv { namespace cuda { namespace device dim3 threads(nthreads, 1); dim3 grid(img_win_width, img_win_height); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; + int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; extract_descrs_by_cols_kernel<<>>( img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp index 1d465ff25c..3d3b5d336f 100644 --- a/modules/cudaobjdetect/src/hog.cpp +++ b/modules/cudaobjdetect/src/hog.cpp @@ -51,34 +51,45 @@ Ptr cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cud #else +/****************************************************************************************\ + The code below is implementation of HOG (Histogram-of-Oriented Gradients) + descriptor and object detection, introduced by Navneet Dalal and Bill Triggs. + + The computed feature vectors are compatible with the + INRIA Object Detection and Localization Toolkit + (http://pascal.inrialpes.fr/soft/olt/) +\****************************************************************************************/ + namespace cv { namespace cuda { namespace device { namespace hog { void set_up_constants(int nbins, int block_stride_x, int block_stride_y, - int nblocks_win_x, int nblocks_win_y); + int nblocks_win_x, int nblocks_win_y, + int ncells_block_x, int ncells_block_y); - void compute_hists(int nbins, int block_stride_x, int blovck_stride_y, - int height, int width, const cv::cuda::PtrStepSzf& grad, - const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists); + void compute_hists(int nbins, int block_stride_x, int block_stride_y, + int height, int width, const PtrStepSzf& grad, + const PtrStepSzb& qangle, float sigma, float* block_hists, + int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y); void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold); + int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y); void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, float* coefs, float free_coef, - float threshold, unsigned char* labels); + float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels); void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - float* coefs, float free_coef, float threshold, float *confidences); + float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences); void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, + int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, cv::cuda::PtrStepSzf descriptors); void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, + int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, cv::cuda::PtrStepSzf descriptors); void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, @@ -167,6 +178,7 @@ namespace double scale0_; int group_threshold_; int descr_format_; + Size cells_per_block_; private: int getTotalHistSize(Size img_size) const; @@ -197,7 +209,8 @@ namespace win_stride_(block_stride), scale0_(1.05), group_threshold_(2), - descr_format_(DESCR_FORMAT_COL_BY_COL) + descr_format_(DESCR_FORMAT_COL_BY_COL), + cells_per_block_(block_size.width / cell_size.width, block_size.height / cell_size.height) { CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 && (win_size.height - block_size.height) % block_stride.height == 0); @@ -205,12 +218,13 @@ namespace CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0); - CV_Assert(block_stride == cell_size); - - CV_Assert(cell_size == Size(8, 8)); + // Navneet Dalal and Bill Triggs. Histograms of oriented gradients for + // human detection. In International Conference on Computer Vision and + // Pattern Recognition, volume 2, pages 886–893, June 2005 + // http://lear.inrialpes.fr/people/triggs/pubs/Dalal-cvpr05.pdf (28.07.2015) [Figure 5] + CV_Assert(block_stride == (block_size / 2)); - Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height); - CV_Assert(cells_per_block == Size(2, 2)); + CV_Assert(cell_size.width == cell_size.height); } static int numPartsWithin(int size, int part_size, int stride) @@ -231,8 +245,7 @@ namespace size_t HOG_Impl::getBlockHistogramSize() const { - Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height); - return nbins_ * cells_per_block.area(); + return nbins_ * cells_per_block_.area(); } double HOG_Impl::getWinSigma() const @@ -313,6 +326,7 @@ namespace detector_.ptr(), (float)free_coef_, (float)hit_threshold_, + cell_size_.width, cells_per_block_.width, labels.ptr()); Mat labels_host; @@ -339,6 +353,7 @@ namespace detector_.ptr(), (float)free_coef_, (float)hit_threshold_, + cell_size_.width, cells_per_block_.width, labels.ptr()); Mat labels_host; @@ -465,6 +480,7 @@ namespace win_stride_.height, win_stride_.width, img.rows, img.cols, block_hists.ptr(), + cell_size_.width, cells_per_block_.width, descriptors); break; case DESCR_FORMAT_COL_BY_COL: @@ -473,6 +489,7 @@ namespace win_stride_.height, win_stride_.width, img.rows, img.cols, block_hists.ptr(), + cell_size_.width, cells_per_block_.width, descriptors); break; default: @@ -490,7 +507,7 @@ namespace void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists) { cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_); - hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height); + hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height); BufferPool pool(Stream::Null()); @@ -505,13 +522,17 @@ namespace img.rows, img.cols, grad, qangle, (float)getWinSigma(), - block_hists.ptr()); + block_hists.ptr(), + cell_size_.width, cell_size_.height, + cells_per_block_.width, cells_per_block_.height); hog::normalize_hists(nbins_, block_stride_.width, block_stride_.height, img.rows, img.cols, block_hists.ptr(), - (float)threshold_L2hys_); + (float)threshold_L2hys_, + cell_size_.width, cell_size_.height, + cells_per_block_.width, cells_per_block_.height); } void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle) diff --git a/modules/cudaobjdetect/test/test_objdetect.cpp b/modules/cudaobjdetect/test/test_objdetect.cpp index 336d6e0718..25c3efddde 100644 --- a/modules/cudaobjdetect/test/test_objdetect.cpp +++ b/modules/cudaobjdetect/test/test_objdetect.cpp @@ -217,9 +217,9 @@ CUDA_TEST_P(HOG, GetDescriptors) r[(x * blocks_per_win_y + y) * block_hist_size + k]); } } - +/* INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES); - +*/ //============== caltech hog tests =====================// struct CalTech : public ::testing::TestWithParam > @@ -269,8 +269,204 @@ INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES, "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"))); +//------------------------variable GPU HOG Tests------------------------// +struct Hog_var : public ::testing::TestWithParam > +{ + cv::cuda::DeviceInfo devInfo; + cv::Mat img, c_img; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + cv::cuda::setDevice(devInfo.deviceID()); + + cv::Rect roi(0, 0, 16, 32); + img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img.empty()); + c_img = img(roi); + } +}; + +CUDA_TEST_P(Hog_var, HOG) +{ + cv::cuda::GpuMat _img(c_img); + cv::cuda::GpuMat d_img; + + int win_stride_width = 8;int win_stride_height = 8; + int win_width = 16; + int block_width = 8; + int block_stride_width = 4;int block_stride_height = 4; + int cell_width = 4; + int nbins = 9; + + Size win_stride(win_stride_width, win_stride_height); + Size win_size(win_width, win_width * 2); + Size block_size(block_width, block_width); + Size block_stride(block_stride_width, block_stride_height); + Size cell_size(cell_width, cell_width); + + cv::Ptr gpu_hog = cv::cuda::HOG::create(win_size, block_size, block_stride, cell_size, nbins); + + gpu_hog->setNumLevels(13); + gpu_hog->setHitThreshold(0); + gpu_hog->setWinStride(win_stride); + gpu_hog->setScaleFactor(1.05); + gpu_hog->setGroupThreshold(8); + gpu_hog->compute(_img, d_img); + + vector gpu_desc_vec; + ASSERT_TRUE(gpu_desc_vec.empty()); + cv::Mat R(d_img); + + cv::HOGDescriptor cpu_hog(win_size, block_size, block_stride, cell_size, nbins); + cpu_hog.nlevels = 13; + vector cpu_desc_vec; + ASSERT_TRUE(cpu_desc_vec.empty()); + cpu_hog.compute(c_img, cpu_desc_vec, win_stride, Size(0,0)); +} + +INSTANTIATE_TEST_CASE_P(detect, Hog_var, testing::Combine(ALL_DEVICES, + ::testing::Values("/hog/road.png"))); + +struct Hog_var_cell : public ::testing::TestWithParam > +{ + cv::cuda::DeviceInfo devInfo; + cv::Mat img, c_img, c_img2, c_img3, c_img4; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + cv::cuda::setDevice(devInfo.deviceID()); + + cv::Rect roi(0, 0, 48, 96); + img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img.empty()); + c_img = img(roi); + + cv::Rect roi2(0, 0, 54, 108); + c_img2 = img(roi2); + cv::Rect roi3(0, 0, 64, 128); + c_img3 = img(roi3); + + cv::Rect roi4(0, 0, 32, 64); + c_img4 = img(roi4); + } +}; + +CUDA_TEST_P(Hog_var_cell, HOG) +{ + cv::cuda::GpuMat _img(c_img); + cv::cuda::GpuMat _img2(c_img2); + cv::cuda::GpuMat _img3(c_img3); + cv::cuda::GpuMat _img4(c_img4); + cv::cuda::GpuMat d_img; + + ASSERT_FALSE(_img.empty()); + ASSERT_TRUE(d_img.empty()); + + int win_stride_width = 8;int win_stride_height = 8; + int win_width = 48; + int block_width = 16; + int block_stride_width = 8;int block_stride_height = 8; + int cell_width = 8; + int nbins = 9; + + Size win_stride(win_stride_width, win_stride_height); + Size win_size(win_width, win_width * 2); + Size block_size(block_width, block_width); + Size block_stride(block_stride_width, block_stride_height); + Size cell_size(cell_width, cell_width); + + cv::Ptr gpu_hog = cv::cuda::HOG::create(win_size, block_size, block_stride, cell_size, nbins); + + gpu_hog->setNumLevels(13); + gpu_hog->setHitThreshold(0); + gpu_hog->setWinStride(win_stride); + gpu_hog->setScaleFactor(1.05); + gpu_hog->setGroupThreshold(8); + gpu_hog->compute(_img, d_img); +//------------------------------------------------------------------------------ + cv::cuda::GpuMat d_img2; + ASSERT_TRUE(d_img2.empty()); + + int win_stride_width2 = 8;int win_stride_height2 = 8; + int win_width2 = 48; + int block_width2 = 16; + int block_stride_width2 = 8;int block_stride_height2 = 8; + int cell_width2 = 4; + + Size win_stride2(win_stride_width2, win_stride_height2); + Size win_size2(win_width2, win_width2 * 2); + Size block_size2(block_width2, block_width2); + Size block_stride2(block_stride_width2, block_stride_height2); + Size cell_size2(cell_width2, cell_width2); + + cv::Ptr gpu_hog2 = cv::cuda::HOG::create(win_size2, block_size2, block_stride2, cell_size2, nbins); + gpu_hog2->setWinStride(win_stride2); + gpu_hog2->compute(_img, d_img2); +//------------------------------------------------------------------------------ + cv::cuda::GpuMat d_img3; + ASSERT_TRUE(d_img3.empty()); + + int win_stride_width3 = 9;int win_stride_height3 = 9; + int win_width3 = 54; + int block_width3 = 18; + int block_stride_width3 = 9;int block_stride_height3 = 9; + int cell_width3 = 6; + + Size win_stride3(win_stride_width3, win_stride_height3); + Size win_size3(win_width3, win_width3 * 2); + Size block_size3(block_width3, block_width3); + Size block_stride3(block_stride_width3, block_stride_height3); + Size cell_size3(cell_width3, cell_width3); + + cv::Ptr gpu_hog3 = cv::cuda::HOG::create(win_size3, block_size3, block_stride3, cell_size3, nbins); + gpu_hog3->setWinStride(win_stride3); + gpu_hog3->compute(_img2, d_img3); +//------------------------------------------------------------------------------ + cv::cuda::GpuMat d_img4; + ASSERT_TRUE(d_img4.empty()); + + int win_stride_width4 = 16;int win_stride_height4 = 16; + int win_width4 = 64; + int block_width4 = 32; + int block_stride_width4 = 16;int block_stride_height4 = 16; + int cell_width4 = 8; + + Size win_stride4(win_stride_width4, win_stride_height4); + Size win_size4(win_width4, win_width4 * 2); + Size block_size4(block_width4, block_width4); + Size block_stride4(block_stride_width4, block_stride_height4); + Size cell_size4(cell_width4, cell_width4); + + cv::Ptr gpu_hog4 = cv::cuda::HOG::create(win_size4, block_size4, block_stride4, cell_size4, nbins); + gpu_hog4->setWinStride(win_stride4); + gpu_hog4->compute(_img3, d_img4); +//------------------------------------------------------------------------------ + cv::cuda::GpuMat d_img5; + ASSERT_TRUE(d_img5.empty()); + + int win_stride_width5 = 16;int win_stride_height5 = 16; + int win_width5 = 64; + int block_width5 = 32; + int block_stride_width5 = 16;int block_stride_height5 = 16; + int cell_width5 = 16; + + Size win_stride5(win_stride_width5, win_stride_height5); + Size win_size5(win_width5, win_width5 * 2); + Size block_size5(block_width5, block_width5); + Size block_stride5(block_stride_width5, block_stride_height5); + Size cell_size5(cell_width5, cell_width5); + + cv::Ptr gpu_hog5 = cv::cuda::HOG::create(win_size5, block_size5, block_stride5, cell_size5, nbins); + gpu_hog5->setWinStride(win_stride5); + gpu_hog5->compute(_img3, d_img5); +//------------------------------------------------------------------------------ +} +INSTANTIATE_TEST_CASE_P(detect, Hog_var_cell, testing::Combine(ALL_DEVICES, + ::testing::Values("/hog/road.png"))); ////////////////////////////////////////////////////////////////////////////////////////// /// LBP classifier diff --git a/samples/gpu/hog.cpp b/samples/gpu/hog.cpp index 8b57c89008..8f857da2ed 100644 --- a/samples/gpu/hog.cpp +++ b/samples/gpu/hog.cpp @@ -22,10 +22,14 @@ public: static Args read(int argc, char** argv); string src; + bool src_is_folder; bool src_is_video; bool src_is_camera; int camera_id; + bool svm_load; + string svm; + bool write_video; string dst_video; double dst_video_fps; @@ -44,6 +48,10 @@ public: int win_width; int win_stride_width, win_stride_height; + int block_width; + int block_stride_width, block_stride_height; + int cell_width; + int nbins; bool gamma_corr; }; @@ -93,6 +101,9 @@ static void printHelp() cout << "Histogram of Oriented Gradients descriptor and detector sample.\n" << "\nUsage: hog_gpu\n" << " (|--video |--camera ) # frames source\n" + << " or" + << " (--folder ) # load images from folder\n" + << " [--svm # load svm file" << " [--make_gray ] # convert image to gray one or not\n" << " [--resize_src ] # do resize of the source image or not\n" << " [--width ] # resized image width\n" @@ -100,9 +111,14 @@ static void printHelp() << " [--hit_threshold ] # classifying plane distance threshold (0.0 usually)\n" << " [--scale ] # HOG window scale factor\n" << " [--nlevels ] # max number of HOG window scales\n" - << " [--win_width ] # width of the window (48 or 64)\n" + << " [--win_width ] # width of the window\n" << " [--win_stride_width ] # distance by OX axis between neighbour wins\n" << " [--win_stride_height ] # distance by OY axis between neighbour wins\n" + << " [--block_width ] # width of the block\n" + << " [--block_stride_width ] # distance by 0X axis between neighbour blocks\n" + << " [--block_stride_height ] # distance by 0Y axis between neighbour blocks\n" + << " [--cell_width ] # width of the cell\n" + << " [--nbins ] # number of bins\n" << " [--gr_threshold ] # merging similar rects constant\n" << " [--gamma_correct ] # do gamma correction or not\n" << " [--write_video ] # write video or not\n" @@ -142,6 +158,8 @@ Args::Args() { src_is_video = false; src_is_camera = false; + src_is_folder = false; + svm_load = false; camera_id = 0; write_video = false; @@ -162,6 +180,11 @@ Args::Args() win_width = 48; win_stride_width = 8; win_stride_height = 8; + block_width = 16; + block_stride_width = 8; + block_stride_height = 8; + cell_width = 8; + nbins = 9; gamma_corr = true; } @@ -186,6 +209,11 @@ Args Args::read(int argc, char** argv) else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]); else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]); else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]); + else if (string(argv[i]) == "--block_width") args.block_width = atoi(argv[++i]); + else if (string(argv[i]) == "--block_stride_width") args.block_stride_width = atoi(argv[++i]); + else if (string(argv[i]) == "--block_stride_height") args.block_stride_height = atoi(argv[++i]); + else if (string(argv[i]) == "--cell_width") args.cell_width = atoi(argv[++i]); + else if (string(argv[i]) == "--nbins") args.nbins = atoi(argv[++i]); else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]); else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true"); else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true"); @@ -194,6 +222,8 @@ Args Args::read(int argc, char** argv) else if (string(argv[i]) == "--help") printHelp(); else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; } else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; } + else if (string(argv[i]) == "--folder") { args.src = argv[++i]; args.src_is_folder = true;} + else if (string(argv[i]) == "--svm") { args.svm = argv[++i]; args.svm_load = true;} else if (args.src.empty()) args.src = argv[i]; else throw runtime_error((string("unknown key: ") + argv[i])); } @@ -228,16 +258,17 @@ App::App(const Args& s) gamma_corr = args.gamma_corr; - if (args.win_width != 64 && args.win_width != 48) - args.win_width = 64; - cout << "Scale: " << scale << endl; if (args.resize_src) cout << "Resized source: (" << args.width << ", " << args.height << ")\n"; cout << "Group threshold: " << gr_threshold << endl; cout << "Levels number: " << nlevels << endl; - cout << "Win width: " << args.win_width << endl; + cout << "Win size: (" << args.win_width << ", " << args.win_width*2 << ")\n"; cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n"; + cout << "Block size: (" << args.block_width << ", " << args.block_width << ")\n"; + cout << "Block stride: (" << args.block_stride_width << ", " << args.block_stride_height << ")\n"; + cout << "Cell size: (" << args.cell_width << ", " << args.cell_width << ")\n"; + cout << "Bins number: " << args.nbins << endl; cout << "Hit threshold: " << hit_threshold << endl; cout << "Gamma correction: " << gamma_corr << endl; cout << endl; @@ -249,22 +280,58 @@ void App::run() running = true; cv::VideoWriter video_writer; - Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96) Size win_stride(args.win_stride_width, args.win_stride_height); + Size win_size(args.win_width, args.win_width * 2); + Size block_size(args.block_width, args.block_width); + Size block_stride(args.block_stride_width, args.block_stride_height); + Size cell_size(args.cell_width, args.cell_width); + + cv::Ptr gpu_hog = cv::cuda::HOG::create(win_size, block_size, block_stride, cell_size, args.nbins); + cv::HOGDescriptor cpu_hog(win_size, block_size, block_stride, cell_size, args.nbins); + + if(args.svm_load) { + std::vector svm_model; + const std::string model_file_name = args.svm; + FileStorage ifs(model_file_name, FileStorage::READ); + if (ifs.isOpened()) { + ifs["svm_detector"] >> svm_model; + } else { + const std::string what = + "could not load model for hog classifier from file: " + + model_file_name; + throw std::runtime_error(what); + } - cv::Ptr gpu_hog = cv::cuda::HOG::create(win_size); - cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9); + // check if the variables are initialized + if (svm_model.empty()) { + const std::string what = + "HoG classifier: svm model could not be loaded from file" + + model_file_name; + throw std::runtime_error(what); + } + + gpu_hog->setSVMDetector(svm_model); + cpu_hog.setSVMDetector(svm_model); + } else { + // Create HOG descriptors and detectors here + Mat detector = gpu_hog->getDefaultPeopleDetector(); - // Create HOG descriptors and detectors here - Mat detector = gpu_hog->getDefaultPeopleDetector(); + gpu_hog->setSVMDetector(detector); + cpu_hog.setSVMDetector(detector); + } - gpu_hog->setSVMDetector(detector); - cpu_hog.setSVMDetector(detector); + cout << "gpusvmDescriptorSize : " << gpu_hog->getDescriptorSize() + << endl; + cout << "cpusvmDescriptorSize : " << cpu_hog.getDescriptorSize() + << endl; while (running) { VideoCapture vc; Mat frame; + vector filenames; + + unsigned int count = 1; if (args.src_is_video) { @@ -273,6 +340,14 @@ void App::run() throw runtime_error(string("can't open video file: " + args.src)); vc >> frame; } + else if (args.src_is_folder) { + String folder = args.src; + cout << folder << endl; + glob(folder, filenames); + frame = imread(filenames[count]); // 0 --> .gitignore + if (!frame.data) + cerr << "Problem loading image from folder!!!" << endl; + } else if (args.src_is_camera) { vc.open(args.camera_id); @@ -327,7 +402,7 @@ void App::run() { cpu_hog.nlevels = nlevels; cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride, - Size(0, 0), scale, gr_threshold); + Size(0, 0), scale, gr_threshold); } hogWorkEnd(); @@ -342,11 +417,20 @@ void App::run() putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); else putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); - putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); - putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); + putText(img_to_show, "FPS HOG: " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); + putText(img_to_show, "FPS total: " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); imshow("opencv_gpu_hog", img_to_show); if (args.src_is_video || args.src_is_camera) vc >> frame; + if (args.src_is_folder) { + count++; + if (count < filenames.size()) { + frame = imread(filenames[count]); + } else { + Mat empty; + frame = empty; + } + } workEnd();