|
|
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
-            }
-
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
-            }
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
+
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
         }
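The kernel now processes each row in three phases: a scalar head of at most three pixels that walks up to the next 4-byte boundary, a vectorized middle that histograms four pixels per 32-bit load, and a scalar tail for the remainder. A standalone illustration of the index arithmetic (plain C++; the offsetX and cols values here are assumed for the example, this is not part of the patch):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int offsetX = 3, cols = 10;              // row begins 3 bytes past a 4-byte boundary
        const int alignedOffset = 4 - offsetX;         // 1 scalar pixel until the boundary
        const int cols_4 = (cols - alignedOffset) / 4; // 2 full 32-bit words cover pixels [1, 9)
        const int tailStart = cols_4 * 4 + alignedOffset;
        std::printf("head [0,%d) words [%d,%d) tail [%d,%d)\n",
                    std::min(alignedOffset, cols), alignedOffset,
                    alignedOffset + cols_4 * 4, tailStart, cols);
        // prints: head [0,1) words [1,9) tail [9,10)
    }

Note the min(alignedOffset, cols) bound on the head loop: for an image narrower than the alignment gap, the head must not run past the end of the row.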
|
|
|
|
|
|
|
|
|
|
@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
                 if ((m >> 0) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
                 if ((m >> 8) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
                 if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
                 if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
-                }
-            }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+                }
+            }
         }
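The masked kernel above follows the same head/word/tail split, testing one mask byte per pixel. A quick end-to-end sanity check of the unaligned path (a hypothetical test sketch, not part of this patch, assuming the host wrappers outside this diff derive offsetX from the ROI position):

    #include <opencv2/cudaimgproc.hpp>
    #include <opencv2/imgproc.hpp>

    bool roiHistMatchesCpu()
    {
        cv::Mat img(64, 64, CV_8UC1);
        cv::randu(img, 0, 256);
        const cv::Rect roi(1, 0, 37, 64);        // x = 1: row start is not 4-byte aligned

        cv::cuda::GpuMat d_img(img), d_hist;
        cv::cuda::calcHist(d_img(roi), d_hist);  // should take the fourByteAligned = false path
        cv::Mat gpuHist;
        d_hist.download(gpuHist);

        int histSize = 256;
        float range[] = { 0, 256 };
        const float* ranges[] = { range };
        cv::Mat cpuHist, roiMat = img(roi);
        cv::calcHist(&roiMat, 1, 0, cv::Mat(), cpuHist, 1, &histSize, ranges);
        cpuHist.convertTo(cpuHist, CV_32S);

        // CPU result is 256x1, GPU result is 1x256; compare them element-wise.
        return cv::countNonZero(cpuHist.reshape(1, 1) != gpuHist.reshape(1, 1)) == 0;
    }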
|
|
@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
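Both wrappers use the same dispatch idiom: a runtime check on offsetX selects a compile-time template instantiation, so the common fully-aligned case carries no extra branching inside the kernel. The callers that compute offsetX live in the host code outside this diff; a minimal sketch of the expected derivation (illustrative only, assuming the byte misalignment of the ROI's row origin is what gets passed down):

    #include <cstdint>

    // Hypothetical helper, not the actual cudaimgproc caller.
    inline int rowMisalignment(const unsigned char* rowOrigin)
    {
        return static_cast<int>(reinterpret_cast<std::uintptr_t>(rowOrigin) % 4);
    }
    // e.g. hist::histogram256(src, d_hist, rowMisalignment(src.data), stream);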
|
|
@@ -186,42 +201,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+                               const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
 
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
-
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
-            }
-
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
-            }
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
+
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
+            }
+
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
         }
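histEvenInc is defined earlier in hist.cu and is untouched by this hunk. For reading the code above it is enough to know its assumed behaviour, sketched here (illustrative, not the actual definition): count the value only if it lies in [lowerLevel, upperLevel], into the evenly sized bin it falls in.

    // Sketch of the assumed semantics of histEvenInc; not the real helper.
    __device__ __forceinline__ void histEvenIncSketch(int* shist, unsigned int data,
                                                      int binSize, int lowerLevel, int upperLevel)
    {
        const int v = static_cast<int>(data);
        if (v >= lowerLevel && v <= upperLevel)
            Emulation::smem::atomicAdd(&shist[(v - lowerLevel) / binSize], 1);
    }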
|
|
|
|
|
|
|
|
|
|
@@ -236,7 +253,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
    {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
|
|
@@ -245,7 +262,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
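At the public API level nothing changes; the offsetX plumbing stays internal. A usage sketch (assuming d_img is an uploaded 8-bit single-channel GpuMat at least 32 columns wide): after this patch, an even histogram over a sub-image whose x-origin is not a multiple of four should match the result for an aligned copy of the same pixels.

    cv::cuda::GpuMat d_hist;
    cv::cuda::histEven(d_img(cv::Rect(1, 0, 31, d_img.rows)), d_hist, 16, 0, 256);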
|
|
|