|
|
|
@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream) |
|
|
|
|
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream) |
|
|
|
|
{ |
|
|
|
|
{ |
|
|
|
|
// each thread handles 16 values, use 1 block/row |
|
|
|
|
const int block = img.cols / 16; |
|
|
|
|
int block = img.cols / 16; |
|
|
|
|
|
|
|
|
|
// safe, because the step actually can't be less than 512 bytes |
|
|
|
|
int align = img.cols % 4; |
|
|
|
|
if ( align != 0) |
|
|
|
|
{ |
|
|
|
|
block += (4 - align); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// launch 1 block / row |
|
|
|
|
const int grid = img.rows; |
|
|
|
|
|
|
|
|
|
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); |
|
|
|
|
|
|
|
|
|
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); |
|
|
|
|
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); |
|
|
|
|
cudaSafeCall( cudaGetLastError() ); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|