From 35f66340d730a50a8a19e61016e9dabfa6696910 Mon Sep 17 00:00:00 2001 From: Claudio Date: Fri, 3 Feb 2017 16:00:16 +0100 Subject: [PATCH 1/3] Add cuda::Stream capability to cuda::HOG::compute In the previous version only the default stream was/could be used, i.e. cv::cuda::Stream::Null(). With this change, HOG::compute() will now run in parallel over different cuda::Streams. The code has been reordered so that all data allocation is completed first, then all the kernels are run in parallel over streams. Fix #8177 --- modules/cudaobjdetect/src/cuda/hog.cu | 77 +++++++++++------------- modules/cudaobjdetect/src/hog.cpp | 87 +++++++++++++-------------- 2 files changed, 77 insertions(+), 87 deletions(-) diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index c8609e7b03..d7dbd4a3d6 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -52,6 +52,7 @@ namespace cv { namespace cuda { namespace device namespace hog { + __constant__ int cnbins; __constant__ int cblock_stride_x; __constant__ int cblock_stride_y; @@ -99,27 +100,28 @@ namespace cv { namespace cuda { namespace device void set_up_constants(int nbins, int block_stride_x, int block_stride_y, - int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y) + int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream) { - cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); - cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); - cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); - cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) ); - cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); - cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) ); - cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) ); + cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_x, &block_stride_x, sizeof(block_stride_x), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_y, &block_stride_y, sizeof(block_stride_y), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x), 0, cudaMemcpyHostToDevice, stream)); + cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y), 0, cudaMemcpyHostToDevice, stream)); int block_hist_size = nbins * ncells_block_x * ncells_block_y; - cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); + cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size, &block_hist_size, sizeof(block_hist_size), 0, cudaMemcpyHostToDevice, stream)); int block_hist_size_2up = power_2up(block_hist_size); - cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) ); + cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up), 0, cudaMemcpyHostToDevice, stream)); int descr_width = nblocks_win_x * block_hist_size; - cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) ); + cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_width, &descr_width, sizeof(descr_width), 0, cudaMemcpyHostToDevice, stream)); int descr_size = descr_width * nblocks_win_y; - cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) ); + cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_size, &descr_size, sizeof(descr_size), 0, cudaMemcpyHostToDevice, stream)); } @@ -233,7 +235,8 @@ namespace cv { namespace cuda { namespace device void compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, const PtrStepSzf& grad, const PtrStepSzb& qangle, float sigma, float* block_hists, - int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y) + int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream) { const int ncells_block = ncells_block_x * ncells_block_y; const int patch_side = cell_size_x / 4; @@ -259,20 +262,15 @@ namespace cv { namespace cuda { namespace device int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float); int smem = hists_size + final_hists_size; if (nblocks == 4) - compute_hists_kernel_many_blocks<4><<>>( - img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + compute_hists_kernel_many_blocks<4><<>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); else if (nblocks == 3) - compute_hists_kernel_many_blocks<3><<>>( - img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + compute_hists_kernel_many_blocks<3><<>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); else if (nblocks == 2) - compute_hists_kernel_many_blocks<2><<>>( - img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); + compute_hists_kernel_many_blocks<2><<>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); else - compute_hists_kernel_many_blocks<1><<>>( - img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); - cudaSafeCall( cudaGetLastError() ); + compute_hists_kernel_many_blocks<1><<>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaGetLastError() ); } @@ -348,7 +346,8 @@ namespace cv { namespace cuda { namespace device void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y) + int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream) { const int nblocks = 1; @@ -361,21 +360,19 @@ namespace cv { namespace cuda { namespace device dim3 grid(divUp(img_block_width, nblocks), img_block_height); if (nthreads == 32) - normalize_hists_kernel_many_blocks<32, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<32, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 64) - normalize_hists_kernel_many_blocks<64, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<64, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 128) - normalize_hists_kernel_many_blocks<128, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<128, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 256) - normalize_hists_kernel_many_blocks<256, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<256, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else if (nthreads == 512) - normalize_hists_kernel_many_blocks<512, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); + normalize_hists_kernel_many_blocks<512, nblocks><<>>(block_hist_size, img_block_width, block_hists, threshold); else CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins"); cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); } @@ -581,7 +578,8 @@ namespace cv { namespace cuda { namespace device void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, - PtrStepSzf descriptors) + PtrStepSzf descriptors, + const cudaStream_t& stream) { const int nthreads = 256; @@ -593,11 +591,9 @@ namespace cv { namespace cuda { namespace device dim3 grid(img_win_width, img_win_height); int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; - extract_descrs_by_cols_kernel<<>>( - img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaGetLastError() ); + extract_descrs_by_cols_kernel<<>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaGetLastError() ); } //---------------------------------------------------------------------------- @@ -708,7 +704,8 @@ namespace cv { namespace cuda { namespace device void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img, - float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma) + float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma, + const cudaStream_t& stream) { (void)nbins; const int nthreads = 256; @@ -717,13 +714,11 @@ namespace cv { namespace cuda { namespace device dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y)); if (correct_gamma) - compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); else - compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC4_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); } template diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp index 3d3b5d336f..a9b4404a67 100644 --- a/modules/cudaobjdetect/src/hog.cpp +++ b/modules/cudaobjdetect/src/hog.cpp @@ -66,15 +66,18 @@ namespace cv { namespace cuda { namespace device { void set_up_constants(int nbins, int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y, - int ncells_block_x, int ncells_block_y); + int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream); void compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, const PtrStepSzf& grad, const PtrStepSzb& qangle, float sigma, float* block_hists, - int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y); + int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream); void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y); + int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + const cudaStream_t& stream); void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, @@ -90,12 +93,14 @@ namespace cv { namespace cuda { namespace device cv::cuda::PtrStepSzf descriptors); void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, - cv::cuda::PtrStepSzf descriptors); + cv::cuda::PtrStepSzf descriptors, + const cudaStream_t& stream); void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma); void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, - float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma); + float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma, + const cudaStream_t& stream); void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst); void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst); @@ -182,8 +187,8 @@ namespace private: int getTotalHistSize(Size img_size) const; - void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists); - void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle); + void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream); +// void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle, Stream& stream); // Coefficients of the separating plane float free_coef_; @@ -310,7 +315,7 @@ namespace BufferPool pool(Stream::Null()); GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1); - computeBlockHistograms(img, block_hists); + computeBlockHistograms(img, block_hists, Stream::Null()); Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_); @@ -458,19 +463,16 @@ namespace CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 ); CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 ); - CV_Assert( !stream ); - - BufferPool pool(stream); - - GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1); - computeBlockHistograms(img, block_hists); + BufferPool pool(stream); + GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1); + Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_); + Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_); const size_t block_hist_size = getBlockHistogramSize(); - Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_); - Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_); - _descriptors.create(wins_per_img.area(), static_cast(blocks_per_win.area() * block_hist_size), CV_32FC1); - GpuMat descriptors = _descriptors.getGpuMat(); + GpuMat descriptors = _descriptors.getGpuMat(); + + computeBlockHistograms(img, block_hists, stream); switch (descr_format_) { @@ -490,7 +492,8 @@ namespace img.rows, img.cols, block_hists.ptr(), cell_size_.width, cells_per_block_.width, - descriptors); + descriptors, + StreamAccessor::getStream(stream)); break; default: CV_Error(cv::Error::StsBadArg, "Unknown descriptor format"); @@ -504,18 +507,25 @@ namespace return static_cast(block_hist_size * blocks_per_img.area()); } - void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists) + void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream) { + BufferPool pool(stream); cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_); - hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height); - - BufferPool pool(Stream::Null()); + float angleScale = static_cast(nbins_ / CV_PI); + GpuMat grad = pool.getBuffer(img.size(), CV_32FC2); + GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2); - GpuMat grad = pool.getBuffer(img.size(), CV_32FC2); - GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2); - computeGradient(img, grad, qangle); + hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream)); - block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1); + switch (img.type()) + { + case CV_8UC1: + hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_); + break; + case CV_8UC4: + hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream)); + break; + } hog::compute_hists(nbins_, block_stride_.width, block_stride_.height, @@ -524,7 +534,8 @@ namespace (float)getWinSigma(), block_hists.ptr(), cell_size_.width, cell_size_.height, - cells_per_block_.width, cells_per_block_.height); + cells_per_block_.width, cells_per_block_.height, + StreamAccessor::getStream(stream)); hog::normalize_hists(nbins_, block_stride_.width, block_stride_.height, @@ -532,24 +543,8 @@ namespace block_hists.ptr(), (float)threshold_L2hys_, cell_size_.width, cell_size_.height, - cells_per_block_.width, cells_per_block_.height); - } - - void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle) - { - grad.create(img.size(), CV_32FC2); - qangle.create(img.size(), CV_8UC2); - - float angleScale = (float)(nbins_ / CV_PI); - switch (img.type()) - { - case CV_8UC1: - hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_); - break; - case CV_8UC4: - hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_); - break; - } + cells_per_block_.width, cells_per_block_.height, + StreamAccessor::getStream(stream)); } } From dd3655f6a6e4c0af5d93d510ce686a0115315c25 Mon Sep 17 00:00:00 2001 From: Claudio Date: Sat, 25 Mar 2017 15:12:26 +0100 Subject: [PATCH 2/3] Align parameter code style between hog .cu and .cpp files --- modules/cudaobjdetect/src/cuda/hog.cu | 43 +++++++++++++++++++-------- modules/cudaobjdetect/src/hog.cpp | 16 ++++++++-- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index d7dbd4a3d6..40d2dec89c 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -52,7 +52,6 @@ namespace cv { namespace cuda { namespace device namespace hog { - __constant__ int cnbins; __constant__ int cblock_stride_x; __constant__ int cblock_stride_y; @@ -99,8 +98,10 @@ namespace cv { namespace cuda { namespace device } - void set_up_constants(int nbins, int block_stride_x, int block_stride_y, - int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y, + void set_up_constants(int nbins, + int block_stride_x, int block_stride_y, + int nblocks_win_x, int nblocks_win_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream) { cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins), 0, cudaMemcpyHostToDevice, stream)); @@ -232,10 +233,14 @@ namespace cv { namespace cuda { namespace device } //declaration of variables and invoke the kernel with the calculated number of blocks - void compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, const PtrStepSzf& grad, - const PtrStepSzb& qangle, float sigma, float* block_hists, - int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void compute_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + const PtrStepSzf& grad, const PtrStepSzb& qangle, + float sigma, + float* block_hists, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream) { const int ncells_block = ncells_block_x * ncells_block_y; @@ -345,8 +350,13 @@ namespace cv { namespace cuda { namespace device } - void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void normalize_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + float* block_hists, + float threshold, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream) { const int nblocks = 1; @@ -576,8 +586,12 @@ namespace cv { namespace cuda { namespace device } - void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, + void extract_descrs_by_cols(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, + int cell_size_x, int ncells_block_x, PtrStepSzf descriptors, const cudaStream_t& stream) { @@ -703,8 +717,11 @@ namespace cv { namespace cuda { namespace device } - void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img, - float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma, + void compute_gradients_8UC4(int nbins, + int height, int width, const PtrStepSzb& img, + float angle_scale, + PtrStepSzf grad, PtrStepSzb qangle, + bool correct_gamma, const cudaStream_t& stream) { (void)nbins; diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp index a9b4404a67..f59bd0c545 100644 --- a/modules/cudaobjdetect/src/hog.cpp +++ b/modules/cudaobjdetect/src/hog.cpp @@ -515,15 +515,25 @@ namespace GpuMat grad = pool.getBuffer(img.size(), CV_32FC2); GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2); - hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream)); + hog::set_up_constants(nbins_, + block_stride_.width, block_stride_.height, + blocks_per_win.width, blocks_per_win.height, + cells_per_block_.width, cells_per_block_.height, + StreamAccessor::getStream(stream)); switch (img.type()) { case CV_8UC1: - hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_); + hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, + angleScale, grad, qangle, gamma_correction_); break; case CV_8UC4: - hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream)); + hog::compute_gradients_8UC4(nbins_, + img.rows, img.cols, img, + angleScale, + grad, qangle, + gamma_correction_, + StreamAccessor::getStream(stream)); break; } From 4709b9d2d8904bd436c8ebb9fce5bf868ee3f078 Mon Sep 17 00:00:00 2001 From: Claudio Date: Sat, 25 Mar 2017 15:43:56 +0100 Subject: [PATCH 3/3] Add cuda::streams to by_rows and 8UC1 functions Fix #8177 --- modules/cudaobjdetect/src/cuda/hog.cu | 41 +++++++++------ modules/cudaobjdetect/src/hog.cpp | 75 +++++++++++++++++++-------- 2 files changed, 78 insertions(+), 38 deletions(-) diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index 40d2dec89c..45a3ecb838 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -518,8 +518,10 @@ namespace cv { namespace cuda { namespace device template - __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - const float* block_hists, PtrStepf descriptors) + __global__ void extract_descrs_by_rows_kernel(const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + const float* block_hists, + PtrStepf descriptors) { // Get left top corner of the window in src const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + @@ -538,8 +540,14 @@ namespace cv { namespace cuda { namespace device } - void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, - int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors) + void extract_descrs_by_rows(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, int cell_size_x, + int ncells_block_x, + PtrStepSzf descriptors, + const cudaStream_t& stream) { const int nthreads = 256; @@ -551,17 +559,16 @@ namespace cv { namespace cuda { namespace device dim3 grid(img_win_width, img_win_height); int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; - extract_descrs_by_rows_kernel<<>>( - img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaGetLastError() ); + extract_descrs_by_rows_kernel<<>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaGetLastError() ); } template - __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, const float* block_hists, + __global__ void extract_descrs_by_cols_kernel(const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + const float* block_hists, PtrStepf descriptors) { // Get left top corner of the window in src @@ -792,8 +799,12 @@ namespace cv { namespace cuda { namespace device } - void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img, - float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma) + void compute_gradients_8UC1(int nbins, + int height, int width, const PtrStepSzb& img, + float angle_scale, + PtrStepSzf grad, PtrStepSzb qangle, + bool correct_gamma, + const cudaStream_t& stream) { (void)nbins; const int nthreads = 256; @@ -802,13 +813,11 @@ namespace cv { namespace cuda { namespace device dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y)); if (correct_gamma) - compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); else - compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp index f59bd0c545..f06ce363bb 100644 --- a/modules/cudaobjdetect/src/hog.cpp +++ b/modules/cudaobjdetect/src/hog.cpp @@ -64,19 +64,29 @@ namespace cv { namespace cuda { namespace device { namespace hog { - void set_up_constants(int nbins, int block_stride_x, int block_stride_y, + void set_up_constants(int nbins, + int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); - void compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, const PtrStepSzf& grad, - const PtrStepSzb& qangle, float sigma, float* block_hists, - int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void compute_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + const PtrStepSzf& grad, const PtrStepSzb& qangle, + float sigma, + float* block_hists, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); - void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void normalize_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + float* block_hists, + float threshold, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); void classify_hists(int win_height, int win_width, int block_stride_y, @@ -85,21 +95,37 @@ namespace cv { namespace cuda { namespace device float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels); void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences); - - void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, - cv::cuda::PtrStepSzf descriptors); - void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, + int win_stride_y, int win_stride_x, int height, int width, float* block_hists, + float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences); + + void extract_descrs_by_rows(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, + int cell_size_x, int ncells_block_x, + cv::cuda::PtrStepSzf descriptors, + const cudaStream_t& stream); + void extract_descrs_by_cols(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, + int cell_size_x, int ncells_block_x, cv::cuda::PtrStepSzf descriptors, const cudaStream_t& stream); - void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, - float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma); - void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, - float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma, + void compute_gradients_8UC1(int nbins, + int height, int width, const cv::cuda::PtrStepSzb& img, + float angle_scale, + cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, + bool correct_gamma, + const cudaStream_t& stream); + void compute_gradients_8UC4(int nbins, + int height, int width, const cv::cuda::PtrStepSzb& img, + float angle_scale, + cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, + bool correct_gamma, const cudaStream_t& stream); void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst); @@ -483,7 +509,8 @@ namespace img.rows, img.cols, block_hists.ptr(), cell_size_.width, cells_per_block_.width, - descriptors); + descriptors, + StreamAccessor::getStream(stream)); break; case DESCR_FORMAT_COL_BY_COL: hog::extract_descrs_by_cols(win_size_.height, win_size_.width, @@ -524,8 +551,12 @@ namespace switch (img.type()) { case CV_8UC1: - hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, - angleScale, grad, qangle, gamma_correction_); + hog::compute_gradients_8UC1(nbins_, + img.rows, img.cols, img, + angleScale, + grad, qangle, + gamma_correction_, + StreamAccessor::getStream(stream)); break; case CV_8UC4: hog::compute_gradients_8UC4(nbins_,