@@ -52,6 +52,7 @@ namespace cv { namespace cuda { namespace device
     namespace hog
     {
         __constant__ int cnbins;
         __constant__ int cblock_stride_x;
         __constant__ int cblock_stride_y;
@@ -99,27 +100,28 @@ namespace cv { namespace cuda { namespace device
         void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
+                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y,
+                              const cudaStream_t& stream)
         {
-            cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cblock_stride_x, &block_stride_x, sizeof(block_stride_x), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cblock_stride_y, &block_stride_y, sizeof(block_stride_y), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x), 0, cudaMemcpyHostToDevice, stream) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y), 0, cudaMemcpyHostToDevice, stream) );

             int block_hist_size = nbins * ncells_block_x * ncells_block_y;
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cblock_hist_size, &block_hist_size, sizeof(block_hist_size), 0, cudaMemcpyHostToDevice, stream) );

             int block_hist_size_2up = power_2up(block_hist_size);
-            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up), 0, cudaMemcpyHostToDevice, stream) );

             int descr_width = nblocks_win_x * block_hist_size;
-            cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cdescr_width, &descr_width, sizeof(descr_width), 0, cudaMemcpyHostToDevice, stream) );

             int descr_size = descr_width * nblocks_win_y;
-            cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
+            cudaSafeCall( cudaMemcpyToSymbolAsync(cdescr_size, &descr_size, sizeof(descr_size), 0, cudaMemcpyHostToDevice, stream) );
         }
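
The hunk above replaces each blocking cudaMemcpyToSymbol with its stream-ordered
cudaMemcpyToSymbolAsync counterpart. A minimal standalone sketch of that pattern
(the symbol, kernel, and driver below are illustrative, not part of the patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ int cnbins_demo;                       // stand-in for a constant such as cnbins

    __global__ void read_nbins(int* out) { *out = cnbins_demo; }

    int main()
    {
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        int nbins = 9;
        // Enqueued on `stream`; the host is not blocked on the device.
        cudaMemcpyToSymbolAsync(cnbins_demo, &nbins, sizeof(nbins), 0,
                                cudaMemcpyHostToDevice, stream);

        int* d_out;
        cudaMalloc(&d_out, sizeof(int));
        read_nbins<<<1, 1, 0, stream>>>(d_out);         // ordered after the copy on the same stream

        int h_out = 0;
        cudaMemcpyAsync(&h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);                  // one sync point, chosen by the caller
        printf("nbins on device: %d\n", h_out);

        cudaFree(d_out);
        cudaStreamDestroy(stream);
        return 0;
    }

Async copies from pageable host memory return only after the source has been
staged, so passing stack variables, as set_up_constants does with its by-value
parameters, remains safe.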
@@ -233,7 +235,8 @@ namespace cv { namespace cuda { namespace device
         void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                            int height, int width, const PtrStepSzf& grad,
                            const PtrStepSzb& qangle, float sigma, float* block_hists,
-                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                           const cudaStream_t& stream)
         {
             const int ncells_block = ncells_block_x * ncells_block_y;
             const int patch_side = cell_size_x / 4;
@@ -259,20 +262,15 @@ namespace cv { namespace cuda { namespace device
             int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
             int smem = hists_size + final_hists_size;
             if (nblocks == 4)
-                compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<4><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else if (nblocks == 3)
-                compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<3><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else if (nblocks == 2)
-                compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+                compute_hists_kernel_many_blocks<2><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
             else
-                compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
-                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
-            cudaSafeCall( cudaGetLastError() );
+                compute_hists_kernel_many_blocks<1><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);

-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
         }
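
compute_hists selects a kernel specialization at run time and now passes the
stream as the fourth launch-configuration argument while dropping the per-call
device sync. A reduced sketch of that dispatch shape (scale_hists and its caller
are hypothetical; nblocks is assumed to be 1, 2, or 4):

    #include <cuda_runtime.h>

    template <int nblocks>
    __global__ void scale_hists(float* hists, float scale, int n)
    {
        int i = blockIdx.x * (blockDim.x * nblocks) + threadIdx.x;
        #pragma unroll
        for (int b = 0; b < nblocks; ++b, i += blockDim.x)
            if (i < n) hists[i] *= scale;
    }

    void scale_all(float* d_hists, float scale, int n, int nblocks, cudaStream_t stream)
    {
        dim3 threads(256);
        dim3 grid((n + 256 * nblocks - 1) / (256 * nblocks));
        size_t smem = 0;                // compute_hists passes its real shared-memory size here

        // <<<grid, threads, smem, stream>>>: the stream is the fourth argument.
        if (nblocks == 4)
            scale_hists<4><<<grid, threads, smem, stream>>>(d_hists, scale, n);
        else if (nblocks == 2)
            scale_hists<2><<<grid, threads, smem, stream>>>(d_hists, scale, n);
        else
            scale_hists<1><<<grid, threads, smem, stream>>>(d_hists, scale, n);

        // A malformed launch is reported immediately; execution errors surface
        // at the caller's next synchronization instead of blocking here.
        cudaGetLastError();
    }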
@@ -348,7 +346,8 @@ namespace cv { namespace cuda { namespace device
         void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
+                             const cudaStream_t& stream)
         {
             const int nblocks = 1;
@@ -361,21 +360,19 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(divUp(img_block_width, nblocks), img_block_height);

             if (nthreads == 32)
-                normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 64)
-                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 128)
-                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 256)
-                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else if (nthreads == 512)
-                normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
             else
                 CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");

             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
         }
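
With the per-stage cudaDeviceSynchronize() calls removed, none of these helpers
blocks the host anymore; correctness relies on every stage being enqueued on the
same stream and on the caller synchronizing once at the end. A hedged sketch of
that caller side (the HOG call sequence is abbreviated):

    cudaStream_t stream;
    cudaSafeCall( cudaStreamCreate(&stream) );

    // Enqueue the whole pipeline; each stage is stream-ordered after the previous one:
    //   set_up_constants(..., stream);
    //   compute_gradients_8UC4(..., stream);
    //   compute_hists(..., stream);
    //   normalize_hists(..., stream);
    //   extract_descrs_by_cols(..., stream);

    cudaSafeCall( cudaStreamSynchronize(stream) );   // single blocking point, once results are needed
    cudaSafeCall( cudaStreamDestroy(stream) );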
@@ -581,7 +578,8 @@ namespace cv { namespace cuda { namespace device
         void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                     int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                    PtrStepSzf descriptors)
+                                    PtrStepSzf descriptors,
+                                    const cudaStream_t& stream)
         {
             const int nthreads = 256;
@@ -593,11 +591,9 @@ namespace cv { namespace cuda { namespace device
             dim3 grid(img_win_width, img_win_height);

             int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-            extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
-                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-            cudaSafeCall( cudaGetLastError() );
+            extract_descrs_by_cols_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);

-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
         }

         //----------------------------------------------------------------------------
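
On the host side these device-layer functions now take a raw cudaStream_t rather
than OpenCV's cv::cuda::Stream wrapper. A sketch of how a caller would typically
unwrap the handle (run_descriptor_stage is an illustrative name, not part of the
patch):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/core/cuda_stream_accessor.hpp>

    void run_descriptor_stage(cv::cuda::Stream& s)
    {
        // StreamAccessor::getStream converts the wrapper into the raw CUDA handle;
        // Stream::Null() yields the default stream.
        cudaStream_t raw = cv::cuda::StreamAccessor::getStream(s);
        // ... pass `raw` through, e.g. extract_descrs_by_cols(..., raw);
    }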
@@ -708,7 +704,8 @@ namespace cv { namespace cuda { namespace device
         void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
-                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+                                    float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma,
+                                    const cudaStream_t& stream)
         {
             (void)nbins;
             const int nthreads = 256;
@@ -717,13 +714,11 @@ namespace cv { namespace cuda { namespace device
             dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

             if (correct_gamma)
-                compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
             else
-                compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+                compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);

             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
         }

         template <int nthreads, int correct_gamma>
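
compute_gradients_8UC4 keeps gamma correction as a compile-time template
parameter while threading the stream through the launch. A reduced sketch of the
same dispatch pattern with a hypothetical 1-D kernel:

    #include <cuda_runtime.h>

    template <int correct_gamma>
    __global__ void gradient_like_kernel(const float* in, float* out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n) return;
        float v = in[i];
        if (correct_gamma)              // resolved at compile time, dead branch eliminated
            v = sqrtf(v);
        out[i] = v;
    }

    void launch_gradients(const float* d_in, float* d_out, int n,
                          bool correct_gamma, cudaStream_t stream)
    {
        dim3 bdim(256);
        dim3 gdim((n + bdim.x - 1) / bdim.x);
        if (correct_gamma)
            gradient_like_kernel<1><<<gdim, bdim, 0, stream>>>(d_in, d_out, n);
        else
            gradient_like_kernel<0><<<gdim, bdim, 0, stream>>>(d_in, d_out, n);
        cudaGetLastError();             // launch check only; no device-wide sync
    }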