|
|
@ -48,11 +48,11 @@ using namespace cv::cudev; |
|
|
|
|
|
|
|
|
|
|
|
namespace clahe |
|
|
|
namespace clahe |
|
|
|
{ |
|
|
|
{ |
|
|
|
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut, |
|
|
|
__global__ void calcLutKernel_8U(const PtrStepb src, PtrStepb lut, |
|
|
|
const int2 tileSize, const int tilesX, |
|
|
|
const int2 tileSize, const int tilesX, |
|
|
|
const int clipLimit, const float lutScale) |
|
|
|
const int clipLimit, const float lutScale) |
|
|
|
{ |
|
|
|
{ |
|
|
|
__shared__ int smem[512]; |
|
|
|
__shared__ int smem[256]; |
|
|
|
|
|
|
|
|
|
|
|
const int tx = blockIdx.x; |
|
|
|
const int tx = blockIdx.x; |
|
|
|
const int ty = blockIdx.y; |
|
|
|
const int ty = blockIdx.y; |
|
|
@ -95,18 +95,28 @@ namespace clahe |
|
|
|
// broadcast evaluated value |
|
|
|
// broadcast evaluated value |
|
|
|
|
|
|
|
|
|
|
|
__shared__ int totalClipped; |
|
|
|
__shared__ int totalClipped; |
|
|
|
|
|
|
|
__shared__ int redistBatch; |
|
|
|
|
|
|
|
__shared__ int residual; |
|
|
|
|
|
|
|
__shared__ int rStep; |
|
|
|
|
|
|
|
|
|
|
|
if (tid == 0) |
|
|
|
if (tid == 0) |
|
|
|
|
|
|
|
{ |
|
|
|
totalClipped = clipped; |
|
|
|
totalClipped = clipped; |
|
|
|
|
|
|
|
redistBatch = totalClipped / 256; |
|
|
|
|
|
|
|
residual = totalClipped - redistBatch * 256; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rStep = 1; |
|
|
|
|
|
|
|
if (residual != 0) |
|
|
|
|
|
|
|
rStep = 256 / residual; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
// redistribute clipped samples evenly |
|
|
|
// redistribute clipped samples evenly |
|
|
|
|
|
|
|
|
|
|
|
int redistBatch = totalClipped / 256; |
|
|
|
|
|
|
|
tHistVal += redistBatch; |
|
|
|
tHistVal += redistBatch; |
|
|
|
|
|
|
|
|
|
|
|
int residual = totalClipped - redistBatch * 256; |
|
|
|
if (residual && tid % rStep == 0 && tid / rStep < residual) |
|
|
|
if (tid < residual) |
|
|
|
|
|
|
|
++tHistVal; |
|
|
|
++tHistVal; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -115,12 +125,212 @@ namespace clahe |
|
|
|
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal)); |
|
|
|
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) |
|
|
|
__global__ void calcLutKernel_16U(const PtrStepus src, PtrStepus lut, |
|
|
|
|
|
|
|
const int2 tileSize, const int tilesX, |
|
|
|
|
|
|
|
const int clipLimit, const float lutScale, |
|
|
|
|
|
|
|
PtrStepSzi hist) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#define histSize 65536 |
|
|
|
|
|
|
|
#define blockSize 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__shared__ int smem[blockSize]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int tx = blockIdx.x; |
|
|
|
|
|
|
|
const int ty = blockIdx.y; |
|
|
|
|
|
|
|
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int histRow = ty * tilesX + tx; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// build histogram |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = tid; i < histSize; i += blockSize) |
|
|
|
|
|
|
|
hist(histRow, i) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const ushort* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x; |
|
|
|
|
|
|
|
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const int data = srcPtr[j]; |
|
|
|
|
|
|
|
::atomicAdd(&hist(histRow, data), 1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (clipLimit > 0) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// clip histogram bar && |
|
|
|
|
|
|
|
// find number of overall clipped samples |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__shared__ int partialSum[blockSize]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = tid; i < histSize; i += blockSize) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int histVal = hist(histRow, i); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int clipped = 0; |
|
|
|
|
|
|
|
if (histVal > clipLimit) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
clipped = histVal - clipLimit; |
|
|
|
|
|
|
|
hist(histRow, i) = clipLimit; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Following code block is in effect equivalent to: |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
// blockReduce<blockSize>(smem, clipped, tid, plus<int>()); |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int j = 16; j >= 1; j /= 2) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#if __CUDACC_VER_MAJOR__ >= 9 |
|
|
|
|
|
|
|
int val = __shfl_down_sync(0xFFFFFFFFU, clipped, j); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
int val = __shfl_down(clipped, j); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
clipped += val; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid % 32 == 0) |
|
|
|
|
|
|
|
smem[tid / 32] = clipped; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
clipped = smem[tid]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int j = 4; j >= 1; j /= 2) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#if __CUDACC_VER_MAJOR__ >= 9 |
|
|
|
|
|
|
|
int val = __shfl_down_sync(0x000000FFU, clipped, j); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
int val = __shfl_down(clipped, j); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
clipped += val; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
// end of code block |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid == 0) |
|
|
|
|
|
|
|
partialSum[i / blockSize] = clipped; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int partialSum_ = partialSum[tid]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Following code block is in effect equivalent to: |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
// blockReduce<blockSize>(smem, partialSum_, tid, plus<int>()); |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for (int j = 16; j >= 1; j /= 2) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#if __CUDACC_VER_MAJOR__ >= 9 |
|
|
|
|
|
|
|
int val = __shfl_down_sync(0xFFFFFFFFU, partialSum_, j); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
int val = __shfl_down(partialSum_, j); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
partialSum_ += val; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid % 32 == 0) |
|
|
|
|
|
|
|
smem[tid / 32] = partialSum_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
partialSum_ = smem[tid]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int j = 4; j >= 1; j /= 2) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#if __CUDACC_VER_MAJOR__ >= 9 |
|
|
|
|
|
|
|
int val = __shfl_down_sync(0x000000FFU, partialSum_, j); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
int val = __shfl_down(partialSum_, j); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
partialSum_ += val; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
// end of code block |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// broadcast evaluated value && |
|
|
|
|
|
|
|
// redistribute clipped samples evenly |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__shared__ int totalClipped; |
|
|
|
|
|
|
|
__shared__ int redistBatch; |
|
|
|
|
|
|
|
__shared__ int residual; |
|
|
|
|
|
|
|
__shared__ int rStep; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid == 0) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
totalClipped = partialSum_; |
|
|
|
|
|
|
|
redistBatch = totalClipped / histSize; |
|
|
|
|
|
|
|
residual = totalClipped - redistBatch * histSize; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rStep = 1; |
|
|
|
|
|
|
|
if (residual != 0) |
|
|
|
|
|
|
|
rStep = histSize / residual; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = tid; i < histSize; i += blockSize) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int histVal = hist(histRow, i); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int equalized = histVal + redistBatch; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (residual && i % rStep == 0 && i / rStep < residual) |
|
|
|
|
|
|
|
++equalized; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hist(histRow, i) = equalized; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__shared__ int partialScan[blockSize]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = tid; i < histSize; i += blockSize) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int equalized = hist(histRow, i); |
|
|
|
|
|
|
|
equalized = blockScanInclusive<blockSize>(equalized, smem, tid); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tid == blockSize - 1) |
|
|
|
|
|
|
|
partialScan[i / blockSize] = equalized; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hist(histRow, i) = equalized; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int partialScan_ = partialScan[tid]; |
|
|
|
|
|
|
|
partialScan[tid] = blockScanExclusive<blockSize>(partialScan_, smem, tid); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = tid; i < histSize; i += blockSize) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const int lutVal = hist(histRow, i) + partialScan[i / blockSize]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lut(histRow, i) = saturate_cast<ushort>(__float2int_rn(lutScale * lutVal)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#undef histSize |
|
|
|
|
|
|
|
#undef blockSize |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const dim3 block(32, 8); |
|
|
|
const dim3 block(32, 8); |
|
|
|
const dim3 grid(tilesX, tilesY); |
|
|
|
const dim3 grid(tilesX, tilesY); |
|
|
|
|
|
|
|
|
|
|
|
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale); |
|
|
|
calcLutKernel_8U<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale); |
|
|
|
|
|
|
|
|
|
|
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); |
|
|
|
|
|
|
|
|
|
|
@ -128,7 +338,21 @@ namespace clahe |
|
|
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY) |
|
|
|
void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const dim3 block(32, 8); |
|
|
|
|
|
|
|
const dim3 grid(tilesX, tilesY); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
calcLutKernel_16U<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale, hist); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (stream == 0) |
|
|
|
|
|
|
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
|
|
|
__global__ void transformKernel(const PtrStepSz<T> src, PtrStep<T> dst, const PtrStep<T> lut, const int2 tileSize, const int tilesX, const int tilesY) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
|
|
|
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
|
|
@ -159,22 +383,26 @@ namespace clahe |
|
|
|
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya)); |
|
|
|
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya)); |
|
|
|
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya)); |
|
|
|
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya)); |
|
|
|
|
|
|
|
|
|
|
|
dst(y, x) = saturate_cast<uchar>(res); |
|
|
|
dst(y, x) = saturate_cast<T>(res); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) |
|
|
|
template <typename T> |
|
|
|
|
|
|
|
void transform(PtrStepSz<T> src, PtrStepSz<T> dst, PtrStep<T> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const dim3 block(32, 8); |
|
|
|
const dim3 block(32, 8); |
|
|
|
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); |
|
|
|
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); |
|
|
|
|
|
|
|
|
|
|
|
CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) ); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel<T>, cudaFuncCachePreferL1) ); |
|
|
|
|
|
|
|
|
|
|
|
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY); |
|
|
|
transformKernel<T><<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); |
|
|
|
|
|
|
|
|
|
|
|
if (stream == 0) |
|
|
|
if (stream == 0) |
|
|
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); |
|
|
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template void transform<uchar>(PtrStepSz<uchar> src, PtrStepSz<uchar> dst, PtrStep<uchar> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); |
|
|
|
|
|
|
|
template void transform<ushort>(PtrStepSz<ushort> src, PtrStepSz<ushort> dst, PtrStep<ushort> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#endif // CUDA_DISABLER |
|
|
|
#endif // CUDA_DISABLER |
|
|
|