From a28cb99e885aa71503c35e2f26427832c2ead121 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Thu, 1 Aug 2013 16:51:52 +0400 Subject: [PATCH] optimized version of histEven for CV_8UC1 --- modules/gpu/src/cuda/hist.cu | 80 +++++++++++++++++++++++++++++++ modules/gpu/src/imgproc.cpp | 17 ++++++- modules/gpu/test/test_imgproc.cpp | 4 +- 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu index 8b8a1e8c63..d9ba559f9c 100644 --- a/modules/gpu/src/cuda/hist.cu +++ b/modules/gpu/src/cuda/hist.cu @@ -109,6 +109,86 @@ namespace hist ///////////////////////////////////////////////////////////////////////// +namespace hist +{ + __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel) + { + if (data >= lowerLevel && data <= upperLevel) + { + const uint ind = (data - lowerLevel) / binSize; + Emulation::smem::atomicAdd(shist + ind, 1); + } + } + + __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, + int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel) + { + extern __shared__ int shist[]; + + const int y = blockIdx.x * blockDim.y + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < binCount) + shist[tid] = 0; + + __syncthreads(); + + if (y < rows) + { + const uchar* rowPtr = src + y * step; + const uint* rowPtr4 = (uint*) rowPtr; + + const int cols_4 = cols / 4; + for (int x = threadIdx.x; x < cols_4; x += blockDim.x) + { + const uint data = rowPtr4[x]; + + histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel); + histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel); + histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel); + histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel); + } + + if (cols % 4 != 0 && threadIdx.x == 0) + { + for (int x = cols_4 * 4; x < cols; ++x) + { + const uchar data = rowPtr[x]; + histEvenInc(shist, data, binSize, lowerLevel, upperLevel); + } + } + } + + __syncthreads(); + + if (tid < binCount) + { + const int histVal = shist[tid]; + + if (histVal > 0) + ::atomicAdd(hist + tid, histVal); + } + } + + void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(src.rows, block.y)); + + const int binSize = divUp(upperLevel - lowerLevel, binCount); + + const size_t smem_size = binCount * sizeof(int); + + histEven8u<<>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +///////////////////////////////////////////////////////////////////////// + namespace hist { __constant__ int c_lut[256]; diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 8e452b4a1e..23523630ba 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -889,6 +889,21 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerL histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream); } +namespace hist +{ + void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream); +} + +namespace +{ + void histEven8u(const GpuMat& src, GpuMat& hist, GpuMat&, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream) + { + hist.create(1, histSize, CV_32S); + cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) ); + hist::histEven8u(src, hist.ptr(), histSize, lowerLevel, upperLevel, stream); + } +} + void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 ); @@ -896,7 +911,7 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSiz typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream); static const hist_t hist_callers[] = { - NppHistogramEvenC1::hist, + histEven8u, 0, NppHistogramEvenC1::hist, NppHistogramEvenC1::hist diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index a67760ea53..811d1294cf 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -105,8 +105,8 @@ GPU_TEST_P(HistEven, Accuracy) { cv::Mat src = randomMat(size, CV_8UC1); - int hbins = 256; - float hranges[] = {0.0f, 256.0f}; + int hbins = 30; + float hranges[] = {50.0f, 200.0f}; cv::gpu::GpuMat hist; cv::gpu::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);