From 35f66340d730a50a8a19e61016e9dabfa6696910 Mon Sep 17 00:00:00 2001
From: Claudio <claudio.fantacci@gmail.com>
Date: Fri, 3 Feb 2017 16:00:16 +0100
Subject: [PATCH 1/3] Add cuda::Stream capability to cuda::HOG::compute

In the previous version only the default stream was/could be used, i.e.
cv::cuda::Stream::Null().

With this change, HOG::compute() will now run in parallel over different
cuda::Streams.

The code has been reordered so that all data allocation is completed
first, then all the kernels are run in parallel over streams.

Fix #8177
---
 modules/cudaobjdetect/src/cuda/hog.cu | 77 +++++++++++-------------
 modules/cudaobjdetect/src/hog.cpp     | 87 +++++++++++++--------------
 2 files changed, 77 insertions(+), 87 deletions(-)

diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu
index c8609e7b03..d7dbd4a3d6 100644
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -52,6 +52,7 @@ namespace cv { namespace cuda { namespace device
 
     namespace hog
     {
+
         __constant__ int cnbins;
         __constant__ int cblock_stride_x;
         __constant__ int cblock_stride_y;
@@ -99,27 +100,28 @@ namespace cv { namespace cuda { namespace device
 
 
         void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
+                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y,
+                              const cudaStream_t& stream)
         {
-            cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins,               &nbins,               sizeof(nbins),               0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_x,      &block_stride_x,      sizeof(block_stride_x),      0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_y,      &block_stride_y,      sizeof(block_stride_y),      0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_x,       &nblocks_win_x,       sizeof(nblocks_win_x),       0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_y,       &nblocks_win_y,       sizeof(nblocks_win_y),       0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_x,      &ncells_block_x,      sizeof(ncells_block_x),      0, cudaMemcpyHostToDevice, stream));
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_y,      &ncells_block_y,      sizeof(ncells_block_y),      0, cudaMemcpyHostToDevice, stream));
 
             int block_hist_size = nbins * ncells_block_x * ncells_block_y;
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size,     &block_hist_size,     sizeof(block_hist_size),     0, cudaMemcpyHostToDevice, stream));
 
             int block_hist_size_2up = power_2up(block_hist_size);
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up), 0, cudaMemcpyHostToDevice, stream));
 
             int descr_width = nblocks_win_x * block_hist_size;
-            cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_width,         &descr_width,         sizeof(descr_width),         0, cudaMemcpyHostToDevice, stream));
 
             int descr_size = descr_width * nblocks_win_y;
-            cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
+            cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_size,          &descr_size,          sizeof(descr_size),          0, cudaMemcpyHostToDevice, stream));
         }
 
 
@@ -233,7 +235,8 @@ namespace cv { namespace cuda { namespace device
         void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                            int height, int width, const PtrStepSzf& grad,
                            const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                           const cudaStream_t& stream)
         {
             const int ncells_block = ncells_block_x * ncells_block_y;
             const int patch_side = cell_size_x / 4;
@@ -259,20 +262,15 @@ namespace cv { namespace cuda { namespace device
             int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
             int smem = hists_size + final_hists_size;
             if (nblocks == 4)
-                compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<4><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else if (nblocks == 3)
-                compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<3><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else if (nblocks == 2)
-                compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<2><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else
-                compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
-            cudaSafeCall( cudaGetLastError() );
+                compute_hists_kernel_many_blocks<1><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
         }
 
 
@@ -348,7 +346,8 @@ namespace cv { namespace cuda { namespace device
 
 
         void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                             const cudaStream_t& stream)
         {
             const int nblocks = 1;
 
@@ -361,21 +360,19 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(divUp(img_block_width, nblocks), img_block_height);
 
             if (nthreads == 32)
-                normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 64)
-                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 128)
-                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 256)
-                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 512)
-                normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else
                 CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
 
             cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
         }
 
 
@@ -581,7 +578,8 @@ namespace cv { namespace cuda { namespace device
 
         void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                     int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                    PtrStepSzf descriptors)
+                                    PtrStepSzf descriptors,
+                                    const cudaStream_t& stream)
         {
             const int nthreads = 256;
 
@@ -593,11 +591,9 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(img_win_width, img_win_height);
 
             int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-            extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
-                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-            cudaSafeCall( cudaGetLastError() );
+            extract_descrs_by_cols_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
         }
 
         //----------------------------------------------------------------------------
@@ -708,7 +704,8 @@ namespace cv { namespace cuda { namespace device
 
 
         void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
-                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma,
+                                    const cudaStream_t& stream)
         {
             (void)nbins;
             const int nthreads = 256;
@@ -717,13 +714,11 @@ namespace cv { namespace cuda { namespace device
             dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
             if (correct_gamma)
-                compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
             else
-                compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
 
             cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
         }
 
         template <int nthreads, int correct_gamma>
diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp
index 3d3b5d336f..a9b4404a67 100644
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -66,15 +66,18 @@ namespace cv { namespace cuda { namespace device
     {
         void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                               int nblocks_win_x, int nblocks_win_y,
-                              int ncells_block_x, int ncells_block_y);
+                              int ncells_block_x, int ncells_block_y,
+                              const cudaStream_t& stream);
 
         void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                            int height, int width, const PtrStepSzf& grad,
                            const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                           const cudaStream_t& stream);
 
         void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                             const cudaStream_t& stream);
 
         void classify_hists(int win_height, int win_width, int block_stride_y,
                             int block_stride_x, int win_stride_y, int win_stride_x, int height,
@@ -90,12 +93,14 @@ namespace cv { namespace cuda { namespace device
                                     cv::cuda::PtrStepSzf descriptors);
         void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                     int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                    cv::cuda::PtrStepSzf descriptors);
+                                    cv::cuda::PtrStepSzf descriptors,
+                                    const cudaStream_t& stream);
 
         void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
                                     float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
         void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
+                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma,
+                                    const cudaStream_t& stream);
 
         void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
         void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
@@ -182,8 +187,8 @@ namespace
 
     private:
         int getTotalHistSize(Size img_size) const;
-        void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
-        void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
+        void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream);
+//        void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle, Stream& stream);
 
         // Coefficients of the separating plane
         float free_coef_;
@@ -310,7 +315,7 @@ namespace
         BufferPool pool(Stream::Null());
 
         GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
-        computeBlockHistograms(img, block_hists);
+        computeBlockHistograms(img, block_hists, Stream::Null());
 
         Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
 
@@ -458,19 +463,16 @@ namespace
 
         CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
         CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
-        CV_Assert( !stream );
-
-        BufferPool pool(stream);
-
-        GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
-        computeBlockHistograms(img, block_hists);
 
+        BufferPool   pool(stream);
+        GpuMat       block_hists     = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
+        Size         wins_per_img    = numPartsWithin(img.size(), win_size_, win_stride_);
+        Size         blocks_per_win  = numPartsWithin(win_size_, block_size_, block_stride_);
         const size_t block_hist_size = getBlockHistogramSize();
-        Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
-        Size wins_per_img   = numPartsWithin(img.size(), win_size_, win_stride_);
-
         _descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1);
-        GpuMat descriptors = _descriptors.getGpuMat();
+        GpuMat       descriptors     = _descriptors.getGpuMat();
+
+        computeBlockHistograms(img, block_hists, stream);
 
         switch (descr_format_)
         {
@@ -490,7 +492,8 @@ namespace
                                         img.rows, img.cols,
                                         block_hists.ptr<float>(),
                                         cell_size_.width, cells_per_block_.width,
-                                        descriptors);
+                                        descriptors,
+                                        StreamAccessor::getStream(stream));
             break;
         default:
             CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
@@ -504,18 +507,25 @@ namespace
         return static_cast<int>(block_hist_size * blocks_per_img.area());
     }
 
-    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
+    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream)
     {
+        BufferPool pool(stream);
         cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
-        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height);
-
-        BufferPool pool(Stream::Null());
+        float  angleScale = static_cast<float>(nbins_ / CV_PI);
+        GpuMat grad       = pool.getBuffer(img.size(), CV_32FC2);
+        GpuMat qangle     = pool.getBuffer(img.size(), CV_8UC2);
 
-        GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
-        GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
-        computeGradient(img, grad, qangle);
+        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream));
 
-        block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1);
+        switch (img.type())
+        {
+            case CV_8UC1:
+                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
+                break;
+            case CV_8UC4:
+                hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream));
+                break;
+        }
 
         hog::compute_hists(nbins_,
                            block_stride_.width, block_stride_.height,
@@ -524,7 +534,8 @@ namespace
                            (float)getWinSigma(),
                            block_hists.ptr<float>(),
                            cell_size_.width, cell_size_.height,
-                           cells_per_block_.width, cells_per_block_.height);
+                           cells_per_block_.width, cells_per_block_.height,
+                           StreamAccessor::getStream(stream));
 
         hog::normalize_hists(nbins_,
                              block_stride_.width, block_stride_.height,
@@ -532,24 +543,8 @@ namespace
                              block_hists.ptr<float>(),
                              (float)threshold_L2hys_,
                              cell_size_.width, cell_size_.height,
-                             cells_per_block_.width, cells_per_block_.height);
-    }
-
-    void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)
-    {
-        grad.create(img.size(), CV_32FC2);
-        qangle.create(img.size(), CV_8UC2);
-
-        float angleScale = (float)(nbins_ / CV_PI);
-        switch (img.type())
-        {
-            case CV_8UC1:
-                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
-                break;
-            case CV_8UC4:
-                hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
-                break;
-        }
+                             cells_per_block_.width, cells_per_block_.height,
+                             StreamAccessor::getStream(stream));
     }
 }
 

From dd3655f6a6e4c0af5d93d510ce686a0115315c25 Mon Sep 17 00:00:00 2001
From: Claudio <claudio.fantacci@gmail.com>
Date: Sat, 25 Mar 2017 15:12:26 +0100
Subject: [PATCH 2/3] Align parameter code style between hog .cu and .cpp files

---
 modules/cudaobjdetect/src/cuda/hog.cu | 43 +++++++++++++++++++--------
 modules/cudaobjdetect/src/hog.cpp     | 16 ++++++++--
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu
index d7dbd4a3d6..40d2dec89c 100644
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -52,7 +52,6 @@ namespace cv { namespace cuda { namespace device
 
     namespace hog
     {
-
         __constant__ int cnbins;
         __constant__ int cblock_stride_x;
         __constant__ int cblock_stride_y;
@@ -99,8 +98,10 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y,
+        void set_up_constants(int nbins,
+                              int block_stride_x, int block_stride_y,
+                              int nblocks_win_x, int nblocks_win_y,
+                              int ncells_block_x, int ncells_block_y,
                               const cudaStream_t& stream)
         {
             cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins,               &nbins,               sizeof(nbins),               0, cudaMemcpyHostToDevice, stream));
@@ -232,10 +233,14 @@ namespace cv { namespace cuda { namespace device
         }
 
         //declaration of variables and invoke the kernel with the calculated number of blocks
-        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                           int height, int width, const PtrStepSzf& grad,
-                           const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void compute_hists(int nbins,
+                           int block_stride_x, int block_stride_y,
+                           int height, int width,
+                           const PtrStepSzf& grad, const PtrStepSzb& qangle,
+                           float sigma,
+                           float* block_hists,
+                           int cell_size_x, int cell_size_y,
+                           int ncells_block_x, int ncells_block_y,
                            const cudaStream_t& stream)
         {
             const int ncells_block = ncells_block_x * ncells_block_y;
@@ -345,8 +350,13 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void normalize_hists(int nbins,
+                             int block_stride_x, int block_stride_y,
+                             int height, int width,
+                             float* block_hists,
+                             float threshold,
+                             int cell_size_x, int cell_size_y,
+                             int ncells_block_x, int ncells_block_y,
                              const cudaStream_t& stream)
         {
             const int nblocks = 1;
@@ -576,8 +586,12 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
+        void extract_descrs_by_cols(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists,
+                                    int cell_size_x, int ncells_block_x,
                                     PtrStepSzf descriptors,
                                     const cudaStream_t& stream)
         {
@@ -703,8 +717,11 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
-                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma,
+        void compute_gradients_8UC4(int nbins,
+                                    int height, int width, const PtrStepSzb& img,
+                                    float angle_scale,
+                                    PtrStepSzf grad, PtrStepSzb qangle,
+                                    bool correct_gamma,
                                     const cudaStream_t& stream)
         {
             (void)nbins;
diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp
index a9b4404a67..f59bd0c545 100644
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -515,15 +515,25 @@ namespace
         GpuMat grad       = pool.getBuffer(img.size(), CV_32FC2);
         GpuMat qangle     = pool.getBuffer(img.size(), CV_8UC2);
 
-        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream));
+        hog::set_up_constants(nbins_,
+                              block_stride_.width, block_stride_.height,
+                              blocks_per_win.width, blocks_per_win.height,
+                              cells_per_block_.width, cells_per_block_.height,
+                              StreamAccessor::getStream(stream));
 
         switch (img.type())
         {
             case CV_8UC1:
-                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
+                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img,
+                                            angleScale, grad, qangle, gamma_correction_);
                 break;
             case CV_8UC4:
-                hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream));
+                hog::compute_gradients_8UC4(nbins_,
+                                            img.rows, img.cols, img,
+                                            angleScale,
+                                            grad, qangle,
+                                            gamma_correction_,
+                                            StreamAccessor::getStream(stream));
                 break;
         }
 

From 4709b9d2d8904bd436c8ebb9fce5bf868ee3f078 Mon Sep 17 00:00:00 2001
From: Claudio <claudio.fantacci@gmail.com>
Date: Sat, 25 Mar 2017 15:43:56 +0100
Subject: [PATCH 3/3] Add cuda::streams to by_rows and 8UC1 functions

Fix #8177
---
 modules/cudaobjdetect/src/cuda/hog.cu | 41 +++++++++------
 modules/cudaobjdetect/src/hog.cpp     | 75 +++++++++++++++++++--------
 2 files changed, 78 insertions(+), 38 deletions(-)

diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu
index 40d2dec89c..45a3ecb838 100644
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -518,8 +518,10 @@ namespace cv { namespace cuda { namespace device
 
 
         template <int nthreads>
-        __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-                                                      const float* block_hists, PtrStepf descriptors)
+        __global__ void extract_descrs_by_rows_kernel(const int img_block_width,
+                                                      const int win_block_stride_x, const int win_block_stride_y,
+                                                      const float* block_hists,
+                                                      PtrStepf descriptors)
         {
             // Get left top corner of the window in src
             const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@@ -538,8 +540,14 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-                                    int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
+        void extract_descrs_by_rows(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists, int cell_size_x,
+                                    int ncells_block_x,
+                                    PtrStepSzf descriptors,
+                                    const cudaStream_t& stream)
         {
             const int nthreads = 256;
 
@@ -551,17 +559,16 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(img_win_width, img_win_height);
 
             int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-            extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
-                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-            cudaSafeCall( cudaGetLastError() );
+            extract_descrs_by_rows_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
         }
 
 
         template <int nthreads>
-        __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
-                                                      const int win_block_stride_y, const float* block_hists,
+        __global__ void extract_descrs_by_cols_kernel(const int img_block_width,
+                                                      const int win_block_stride_x, const int win_block_stride_y,
+                                                      const float* block_hists,
                                                       PtrStepf descriptors)
         {
             // Get left top corner of the window in src
@@ -792,8 +799,12 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
-                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+        void compute_gradients_8UC1(int nbins,
+                                    int height, int width, const PtrStepSzb& img,
+                                    float angle_scale,
+                                    PtrStepSzf grad, PtrStepSzb qangle,
+                                    bool correct_gamma,
+                                    const cudaStream_t& stream)
         {
             (void)nbins;
             const int nthreads = 256;
@@ -802,13 +813,11 @@ namespace cv { namespace cuda { namespace device
             dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
 
             if (correct_gamma)
-                compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
             else
-                compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
 
             cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
         }
 
 
diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp
index f59bd0c545..f06ce363bb 100644
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -64,19 +64,29 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hog
     {
-        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
+        void set_up_constants(int nbins,
+                              int block_stride_x, int block_stride_y,
                               int nblocks_win_x, int nblocks_win_y,
                               int ncells_block_x, int ncells_block_y,
                               const cudaStream_t& stream);
 
-        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                           int height, int width, const PtrStepSzf& grad,
-                           const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void compute_hists(int nbins,
+                           int block_stride_x, int block_stride_y,
+                           int height, int width,
+                           const PtrStepSzf& grad, const PtrStepSzb& qangle,
+                           float sigma,
+                           float* block_hists,
+                           int cell_size_x, int cell_size_y,
+                           int ncells_block_x, int ncells_block_y,
                            const cudaStream_t& stream);
 
-        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+        void normalize_hists(int nbins,
+                             int block_stride_x, int block_stride_y,
+                             int height, int width,
+                             float* block_hists,
+                             float threshold,
+                             int cell_size_x, int cell_size_y,
+                             int ncells_block_x, int ncells_block_y,
                              const cudaStream_t& stream);
 
         void classify_hists(int win_height, int win_width, int block_stride_y,
@@ -85,21 +95,37 @@ namespace cv { namespace cuda { namespace device
                             float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels);
 
         void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                           float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);
-
-        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                    cv::cuda::PtrStepSzf descriptors);
-        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
+                                      int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                      float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);
+
+        void extract_descrs_by_rows(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists,
+                                    int cell_size_x, int ncells_block_x,
+                                    cv::cuda::PtrStepSzf descriptors,
+                                    const cudaStream_t& stream);
+        void extract_descrs_by_cols(int win_height, int win_width,
+                                    int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x,
+                                    int height, int width,
+                                    float* block_hists,
+                                    int cell_size_x, int ncells_block_x,
                                     cv::cuda::PtrStepSzf descriptors,
                                     const cudaStream_t& stream);
 
-        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
-        void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma,
+        void compute_gradients_8UC1(int nbins,
+                                    int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale,
+                                    cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle,
+                                    bool correct_gamma,
+                                    const cudaStream_t& stream);
+        void compute_gradients_8UC4(int nbins,
+                                    int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale,
+                                    cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle,
+                                    bool correct_gamma,
                                     const cudaStream_t& stream);
 
         void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
@@ -483,7 +509,8 @@ namespace
                                         img.rows, img.cols,
                                         block_hists.ptr<float>(),
                                         cell_size_.width, cells_per_block_.width,
-                                        descriptors);
+                                        descriptors,
+                                        StreamAccessor::getStream(stream));
             break;
         case DESCR_FORMAT_COL_BY_COL:
             hog::extract_descrs_by_cols(win_size_.height, win_size_.width,
@@ -524,8 +551,12 @@ namespace
         switch (img.type())
         {
             case CV_8UC1:
-                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img,
-                                            angleScale, grad, qangle, gamma_correction_);
+                hog::compute_gradients_8UC1(nbins_,
+                                            img.rows, img.cols, img,
+                                            angleScale,
+                                            grad, qangle,
+                                            gamma_correction_,
+                                            StreamAccessor::getStream(stream));
                 break;
             case CV_8UC4:
                 hog::compute_gradients_8UC4(nbins_,