From a28cb99e885aa71503c35e2f26427832c2ead121 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Thu, 1 Aug 2013 16:51:52 +0400
Subject: [PATCH] optimized version of histEven for CV_8UC1

---
 modules/gpu/src/cuda/hist.cu      | 80 +++++++++++++++++++++++++++++++
 modules/gpu/src/imgproc.cpp       | 17 ++++++-
 modules/gpu/test/test_imgproc.cpp |  4 +-
 3 files changed, 98 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index 8b8a1e8c63..d9ba559f9c 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -109,6 +109,86 @@ namespace hist
 
 /////////////////////////////////////////////////////////////////////////
 
+namespace hist
+{
+    __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
+    {
+        if (data >= lowerLevel && data <= upperLevel)
+        {
+            const uint ind = (data - lowerLevel) / binSize;
+            Emulation::smem::atomicAdd(shist + ind, 1);
+        }
+    }
+
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, 
+                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    {
+        extern __shared__ int shist[];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < binCount)
+            shist[tid] = 0;
+
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const uchar* rowPtr = src + y * step;
+            const uint* rowPtr4 = (uint*) rowPtr;
+
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                const uint data = rowPtr4[x];
+
+                histEvenInc(shist, (data >>  0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >>  8) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
+            }
+
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    const uchar data = rowPtr[x];
+                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
+                }
+            }
+        }
+
+        __syncthreads();
+
+        if (tid < binCount)
+        {
+            const int histVal = shist[tid];
+
+            if (histVal > 0)
+                ::atomicAdd(hist + tid, histVal);
+        }
+    }
+
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.rows, block.y));
+
+        const int binSize = divUp(upperLevel - lowerLevel, binCount);
+
+        const size_t smem_size = binCount * sizeof(int);
+
+        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+
 namespace hist
 {
     __constant__ int c_lut[256];
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 8e452b4a1e..23523630ba 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -889,6 +889,21 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerL
     histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
 }
 
+namespace hist
+{
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+}
+
+namespace
+{
+    void histEven8u(const GpuMat& src, GpuMat& hist, GpuMat&, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+    {
+        hist.create(1, histSize, CV_32S);
+        cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+    }
+}
+
 void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
 {
     CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
@@ -896,7 +911,7 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSiz
     typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
     static const hist_t hist_callers[] =
     {
-        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
+        histEven8u,
         0,
         NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
         NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index a67760ea53..811d1294cf 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -105,8 +105,8 @@ GPU_TEST_P(HistEven, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_8UC1);
 
-    int hbins = 256;
-    float hranges[] = {0.0f, 256.0f};
+    int hbins = 30;
+    float hranges[] = {50.0f, 200.0f};
 
     cv::gpu::GpuMat hist;
     cv::gpu::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);