diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index 40d2dec89c..45a3ecb838 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -518,8 +518,10 @@ namespace cv { namespace cuda { namespace device template - __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - const float* block_hists, PtrStepf descriptors) + __global__ void extract_descrs_by_rows_kernel(const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + const float* block_hists, + PtrStepf descriptors) { // Get left top corner of the window in src const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + @@ -538,8 +540,14 @@ namespace cv { namespace cuda { namespace device } - void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, - int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors) + void extract_descrs_by_rows(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, int cell_size_x, + int ncells_block_x, + PtrStepSzf descriptors, + const cudaStream_t& stream) { const int nthreads = 256; @@ -551,17 +559,16 @@ namespace cv { namespace cuda { namespace device dim3 grid(img_win_width, img_win_height); int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x; - extract_descrs_by_rows_kernel<<>>( - img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaGetLastError() ); + extract_descrs_by_rows_kernel<<>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors); - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaGetLastError() ); } template - __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, const float* block_hists, + __global__ void extract_descrs_by_cols_kernel(const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + const float* block_hists, PtrStepf descriptors) { // Get left top corner of the window in src @@ -792,8 +799,12 @@ namespace cv { namespace cuda { namespace device } - void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img, - float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma) + void compute_gradients_8UC1(int nbins, + int height, int width, const PtrStepSzb& img, + float angle_scale, + PtrStepSzf grad, PtrStepSzb qangle, + bool correct_gamma, + const cudaStream_t& stream) { (void)nbins; const int nthreads = 256; @@ -802,13 +813,11 @@ namespace cv { namespace cuda { namespace device dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y)); if (correct_gamma) - compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); else - compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); + compute_gradients_8UC1_kernel<<>>(height, width, img, angle_scale, grad, qangle); cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp index f59bd0c545..f06ce363bb 100644 --- a/modules/cudaobjdetect/src/hog.cpp +++ b/modules/cudaobjdetect/src/hog.cpp @@ -64,19 +64,29 @@ namespace cv { namespace cuda { namespace device { namespace hog { - void set_up_constants(int nbins, int block_stride_x, int block_stride_y, + void set_up_constants(int nbins, + int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); - void compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, const PtrStepSzf& grad, - const PtrStepSzb& qangle, float sigma, float* block_hists, - int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void compute_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + const PtrStepSzf& grad, const PtrStepSzb& qangle, + float sigma, + float* block_hists, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); - void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y, + void normalize_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + float* block_hists, + float threshold, + int cell_size_x, int cell_size_y, + int ncells_block_x, int ncells_block_y, const cudaStream_t& stream); void classify_hists(int win_height, int win_width, int block_stride_y, @@ -85,21 +95,37 @@ namespace cv { namespace cuda { namespace device float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels); void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, - float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences); - - void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, - cv::cuda::PtrStepSzf descriptors); - void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, + int win_stride_y, int win_stride_x, int height, int width, float* block_hists, + float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences); + + void extract_descrs_by_rows(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, + int cell_size_x, int ncells_block_x, + cv::cuda::PtrStepSzf descriptors, + const cudaStream_t& stream); + void extract_descrs_by_cols(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + float* block_hists, + int cell_size_x, int ncells_block_x, cv::cuda::PtrStepSzf descriptors, const cudaStream_t& stream); - void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, - float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma); - void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img, - float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma, + void compute_gradients_8UC1(int nbins, + int height, int width, const cv::cuda::PtrStepSzb& img, + float angle_scale, + cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, + bool correct_gamma, + const cudaStream_t& stream); + void compute_gradients_8UC4(int nbins, + int height, int width, const cv::cuda::PtrStepSzb& img, + float angle_scale, + cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, + bool correct_gamma, const cudaStream_t& stream); void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst); @@ -483,7 +509,8 @@ namespace img.rows, img.cols, block_hists.ptr(), cell_size_.width, cells_per_block_.width, - descriptors); + descriptors, + StreamAccessor::getStream(stream)); break; case DESCR_FORMAT_COL_BY_COL: hog::extract_descrs_by_cols(win_size_.height, win_size_.width, @@ -524,8 +551,12 @@ namespace switch (img.type()) { case CV_8UC1: - hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, - angleScale, grad, qangle, gamma_correction_); + hog::compute_gradients_8UC1(nbins_, + img.rows, img.cols, img, + angleScale, + grad, qangle, + gamma_correction_, + StreamAccessor::getStream(stream)); break; case CV_8UC4: hog::compute_gradients_8UC4(nbins_,