diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index ee4601277f..c7fb2a17c0 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -562,7 +562,17 @@ PERF_TEST_P(Sz, ImgProc_CalcHist,
     }
     else
     {
-        FAIL_NO_CPU();
+        cv::Mat dst;
+
+        const int hbins = 256;
+        const float hranges[] = {0.0f, 256.0f};
+        const int histSize[] = {hbins};
+        const float* ranges[] = {hranges};
+        const int channels[] = {0};
+
+        TEST_CYCLE() cv::calcHist(&src, 1, channels, cv::Mat(), dst, 1, histSize, ranges);
+
+        CPU_SANITY_CHECK(dst);
     }
 }
 
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index 8b8a1e8c63..d0ec257c47 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -109,6 +109,86 @@ namespace hist
 
 /////////////////////////////////////////////////////////////////////////
 
+namespace hist
+{
+    __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
+    {
+        if (data >= lowerLevel && data <= upperLevel)
+        {
+            const uint ind = (data - lowerLevel) / binSize;
+            Emulation::smem::atomicAdd(shist + ind, 1);
+        }
+    }
+
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
+                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    {
+        extern __shared__ int shist[];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < binCount)
+            shist[tid] = 0;
+
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const uchar* rowPtr = src + y * step;
+            const uint* rowPtr4 = (uint*) rowPtr;
+
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                const uint data = rowPtr4[x];
+
+                histEvenInc(shist, (data >>  0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >>  8) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
+            }
+
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    const uchar data = rowPtr[x];
+                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
+                }
+            }
+        }
+
+        __syncthreads();
+
+        if (tid < binCount)
+        {
+            const int histVal = shist[tid];
+
+            if (histVal > 0)
+                ::atomicAdd(hist + tid, histVal);
+        }
+    }
+
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.rows, block.y));
+
+        const int binSize = divUp(upperLevel - lowerLevel, binCount);
+
+        const size_t smem_size = binCount * sizeof(int);
+
+        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+
 namespace hist
 {
     __constant__ int c_lut[256];
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 8e452b4a1e..1904b6aad6 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -889,6 +889,21 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerL
     histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
 }
 
+namespace hist
+{
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+}
+
+namespace
+{
+    void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        hist.create(1, histSize, CV_32S);
+        cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+    }
+}
+
 void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
 {
     CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
@@ -902,6 +917,12 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSiz
         NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
     };
 
+    if (src.depth() == CV_8U && deviceSupports(FEATURE_SET_COMPUTE_30))
+    {
+        histEven8u(src, hist, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+        return;
+    }
+
     hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
 }
 
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index e424763bf6..811d1294cf 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -86,13 +86,16 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Integral, testing::Combine(
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HistEven
 
-struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>
+PARAM_TEST_CASE(HistEven, cv::gpu::DeviceInfo, cv::Size)
 {
     cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+
     virtual void SetUp()
     {
-        devInfo = GetParam();
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -100,57 +103,34 @@ struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>
 
 GPU_TEST_P(HistEven, Accuracy)
 {
-    cv::Mat img = readImage("stereobm/aloe-L.png");
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat hsv;
-    cv::cvtColor(img, hsv, CV_BGR2HSV);
+    cv::Mat src = randomMat(size, CV_8UC1);
 
     int hbins = 30;
-    float hranges[] = {0.0f, 180.0f};
-
-    std::vector<cv::gpu::GpuMat> srcs;
-    cv::gpu::split(loadMat(hsv), srcs);
+    float hranges[] = {50.0f, 200.0f};
 
     cv::gpu::GpuMat hist;
-    cv::gpu::histEven(srcs[0], hist, hbins, (int)hranges[0], (int)hranges[1]);
+    cv::gpu::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
+
+    cv::Mat hist_gold;
 
-    cv::MatND histnd;
     int histSize[] = {hbins};
     const float* ranges[] = {hranges};
    int channels[] = {0};
-    cv::calcHist(&hsv, 1, channels, cv::Mat(), histnd, 1, histSize, ranges);
+    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
 
-    cv::Mat hist_gold = histnd;
     hist_gold = hist_gold.t();
     hist_gold.convertTo(hist_gold, CV_32S);
 
     EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HistEven, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HistEven, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // CalcHist
 
-namespace
-{
-    void calcHistGold(const cv::Mat& src, cv::Mat& hist)
-    {
-        hist.create(1, 256, CV_32SC1);
-        hist.setTo(cv::Scalar::all(0));
-
-        int* hist_row = hist.ptr<int>();
-        for (int y = 0; y < src.rows; ++y)
-        {
-            const uchar* src_row = src.ptr(y);
-
-            for (int x = 0; x < src.cols; ++x)
-                ++hist_row[src_row[x]];
-        }
-    }
-}
-
 PARAM_TEST_CASE(CalcHist, cv::gpu::DeviceInfo, cv::Size)
 {
     cv::gpu::DeviceInfo devInfo;
@@ -174,7 +154,16 @@ GPU_TEST_P(CalcHist, Accuracy)
     cv::gpu::calcHist(loadMat(src), hist);
 
     cv::Mat hist_gold;
-    calcHistGold(src, hist_gold);
+
+    const int hbins = 256;
+    const float hranges[] = {0.0f, 256.0f};
+    const int histSize[] = {hbins};
+    const float* ranges[] = {hranges};
+    const int channels[] = {0};
+
+    cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
+    hist_gold = hist_gold.reshape(1, 1);
+    hist_gold.convertTo(hist_gold, CV_32S);
 
     EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
 }
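
Note: below is a minimal usage sketch, not part of the patch, showing how the new CV_8U path is reached from user code. The cv::gpu::histEven call, the CV_8UC1 input type, the 1 x histSize CV_32S output and the compute-3.0 gate come from the changes above; the image size, the random fill and the choice of 256 even bins over [0, 256) are illustrative assumptions.

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // 8-bit single-channel input filled with random values.
        cv::Mat src(480, 640, CV_8UC1);
        cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(256));

        // On devices supporting compute capability 3.0 this dispatches to the new
        // shared-memory histEven8u kernel; on older devices it falls back to the NPP path.
        cv::gpu::GpuMat d_src(src), d_hist;
        cv::gpu::histEven(d_src, d_hist, 256, 0, 256);

        cv::Mat hist;
        d_hist.download(hist); // 1 x 256, CV_32S
        return 0;
    }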