diff --git a/modules/cudaimgproc/src/cuda/hist.cu b/modules/cudaimgproc/src/cuda/hist.cu index be13091f12..6bc5f15e6c 100644 --- a/modules/cudaimgproc/src/cuda/hist.cu +++ b/modules/cudaimgproc/src/cuda/hist.cu @@ -257,18 +257,15 @@ namespace hist namespace hist { - __constant__ int c_lut[256]; - struct EqualizeHist : unary_function { - float scale; + const uchar* lut; - __host__ EqualizeHist(float _scale) : scale(_scale) {} + __host__ EqualizeHist(const uchar* _lut) : lut(_lut) {} __device__ __forceinline__ uchar operator ()(uchar val) const { - const int lut = c_lut[val]; - return __float2int_rn(scale * lut); + return lut[val]; } }; } @@ -283,16 +280,137 @@ namespace cv { namespace cuda { namespace device namespace hist { - void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) + void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream) { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) ); + device::transform(src, dst, EqualizeHist(lut), WithOutMask(), stream); + } + + __global__ void buildLutKernel(int* hist, unsigned char* lut, int size) + { + __shared__ int warp_smem[8]; + __shared__ int hist_smem[8][33]; + +#define HIST_SMEM_NO_BANK_CONFLICT(idx) hist_smem[(idx) >> 5][(idx) & 31] + + const int tId = threadIdx.x; + const int warpId = threadIdx.x / 32; + const int laneId = threadIdx.x % 32; + + // Step1 - Find minimum non-zero value in hist and make it zero + HIST_SMEM_NO_BANK_CONFLICT(tId) = hist[tId]; + int nonZeroIdx = HIST_SMEM_NO_BANK_CONFLICT(tId) > 0 ? tId : 256; + + __syncthreads(); + + for (int delta = 16; delta > 0; delta /= 2) + { +#if __CUDACC_VER_MAJOR__ >= 9 + int shflVal = __shfl_down_sync(0xFFFFFFFF, nonZeroIdx, delta); +#else + int shflVal = __shfl_down(nonZeroIdx, delta); +#endif + if (laneId < delta) + nonZeroIdx = min(nonZeroIdx, shflVal); + } + + if (laneId == 0) + warp_smem[warpId] = nonZeroIdx; - const float scale = 255.0f / (src.cols * src.rows); + __syncthreads(); + + if (tId < 8) + { + int warpVal = warp_smem[tId]; + for (int delta = 4; delta > 0; delta /= 2) + { +#if __CUDACC_VER_MAJOR__ >= 9 + int shflVal = __shfl_down_sync(0x000000FF, warpVal, delta); +#else + int shflVal = __shfl_down(warpVal, delta); +#endif + if (tId < delta) + warpVal = min(warpVal, shflVal); + } + if (tId == 0) + { + warp_smem[0] = warpVal; // warpVal - minimum index + } + } + + __syncthreads(); + + const int minNonZeroIdx = warp_smem[0]; + const int minNonZeroVal = HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx); + if (minNonZeroVal == size) + { + // This is a special case: the whole image has the same color + + lut[tId] = 0; + if (tId == minNonZeroIdx) + lut[tId] = minNonZeroIdx; + return; + } - device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream); + if (tId == 0) + HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx) = 0; + + __syncthreads(); + + // Step2 - Inclusive sum + // Algorithm from GPU Gems 3 (A Work-Efficient Parallel Scan) + // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda + + // Step2 Phase1 - The Up-Sweep Phase + for (int delta = 1; delta < 256; delta *= 2) + { + if (tId < 128 / delta) + { + int idx = 255 - 2 * tId * delta; + HIST_SMEM_NO_BANK_CONFLICT(idx) += HIST_SMEM_NO_BANK_CONFLICT(idx - delta); + } + __syncthreads(); + } + + // Step2 Phase2 - The Down-Sweep Phase + if (tId == 0) + HIST_SMEM_NO_BANK_CONFLICT(255) = 0; + + for (int delta = 128; delta >= 1; delta /= 2) + { + if (tId < 128 / delta) + { + int rootIdx = 255 - tId * delta * 2; + int leftIdx = rootIdx - delta; + int tmp = HIST_SMEM_NO_BANK_CONFLICT(leftIdx); + HIST_SMEM_NO_BANK_CONFLICT(leftIdx) = HIST_SMEM_NO_BANK_CONFLICT(rootIdx); + HIST_SMEM_NO_BANK_CONFLICT(rootIdx) += tmp; + } + __syncthreads(); + } + + // Step2 Phase3 - Convert exclusive sum to inclusive sum + int tmp = HIST_SMEM_NO_BANK_CONFLICT(tId); + __syncthreads(); + if (tId >= 1) + HIST_SMEM_NO_BANK_CONFLICT(tId - 1) = tmp; + if (tId == 255) + HIST_SMEM_NO_BANK_CONFLICT(tId) = tmp + hist[tId]; + __syncthreads(); + + // Step3 - Scale values to build lut + + lut[tId] = saturate_cast(HIST_SMEM_NO_BANK_CONFLICT(tId) * (255.0f / (size - minNonZeroVal))); + +#undef HIST_SMEM_NO_BANK_CONFLICT + } + + void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream) + { + buildLutKernel<<<1, 256, 0, stream>>>(hist.data, lut.data, size); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } } diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index e616c5a2e9..c252abc451 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -102,7 +102,8 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St namespace hist { - void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); + void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream); + void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream); } void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) @@ -114,26 +115,21 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) _dst.create(src.size(), src.type()); GpuMat dst = _dst.getGpuMat(); - int intBufSize; - nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) ); - - size_t bufSize = intBufSize + 2 * 256 * sizeof(int); + size_t bufSize = 256 * sizeof(int) + 256 * sizeof(uchar); BufferPool pool(_stream); GpuMat buf = pool.getBuffer(1, static_cast(bufSize), CV_8UC1); GpuMat hist(1, 256, CV_32SC1, buf.data); - GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int)); - GpuMat intBuf(1, intBufSize, CV_8UC1, buf.data + 2 * 256 * sizeof(int)); + GpuMat lut(1, 256, CV_8UC1, buf.data + 256 * sizeof(int)); cuda::calcHist(src, hist, _stream); cudaStream_t stream = StreamAccessor::getStream(_stream); - NppStreamHandler h(stream); - nppSafeCall( nppsIntegral_32s(hist.ptr(), lut.ptr(), 256, intBuf.ptr()) ); + hist::buildLut(hist, lut, src.rows * src.cols, stream); - hist::equalizeHist(src, dst, lut.ptr(), stream); + hist::equalizeHist(src, dst, lut.data, stream); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp index 6af8eb2135..ac7c5e209f 100644 --- a/modules/cudaimgproc/test/test_histogram.cpp +++ b/modules/cudaimgproc/test/test_histogram.cpp @@ -208,7 +208,7 @@ CUDA_TEST_P(EqualizeHist, Async) cv::Mat dst_gold; cv::equalizeHist(src, dst_gold); - EXPECT_MAT_NEAR(dst_gold, dst, 3.0); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } CUDA_TEST_P(EqualizeHist, Accuracy) @@ -221,13 +221,91 @@ CUDA_TEST_P(EqualizeHist, Accuracy) cv::Mat dst_gold; cv::equalizeHist(src, dst_gold); - EXPECT_MAT_NEAR(dst_gold, dst, 3.0); + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHist, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES)); +TEST(EqualizeHistIssue, Issue18035) +{ + std::vector imgPaths; + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/3MP.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/5MP.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/airplane.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/baboon.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box_in_scene.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits_ecc.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/graffiti.png"); + imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/lena.png"); + + for (int i = 0; i < imgPaths.size(); ++i) + { + std::string imgPath = imgPaths[i]; + cv::Mat src = cv::imread(imgPath, cv::IMREAD_GRAYSCALE); + src = src / 30; + + cv::cuda::GpuMat d_src, dst; + d_src.upload(src); + cv::cuda::equalizeHist(d_src, dst); + + cv::Mat dst_gold; + cv::equalizeHist(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); + } +} + +PARAM_TEST_CASE(EqualizeHistExtreme, cv::cuda::DeviceInfo, cv::Size, int) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + int val; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + val = GET_PARAM(2); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(EqualizeHistExtreme, Case1) +{ + cv::Mat src(size, CV_8UC1, val); + + cv::cuda::GpuMat dst; + cv::cuda::equalizeHist(loadMat(src), dst); + + cv::Mat dst_gold; + cv::equalizeHist(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +CUDA_TEST_P(EqualizeHistExtreme, Case2) +{ + cv::Mat src = randomMat(size, CV_8UC1, val); + + cv::cuda::GpuMat dst; + cv::cuda::equalizeHist(loadMat(src), dst); + + cv::Mat dst_gold; + cv::equalizeHist(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHistExtreme, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Range(0, 256))); + /////////////////////////////////////////////////////////////////////////////////////////////////////// // CLAHE