diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index 7a96da62aa..fffc194ff7 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -54,16 +54,7 @@ namespace clahe const int tilesX, const int tilesY, const cv::Size tileSize, const int clipLimit, const float lutScale) { - cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc); - - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; - cv::String opts; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple()); - - cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts); + cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc); if(k.empty()) return false; diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl index ba69085634..1c806e9b6a 100644 --- a/modules/imgproc/src/opencl/clahe.cl +++ b/modules/imgproc/src/opencl/clahe.cl @@ -43,10 +43,6 @@ // //M*/ -#ifndef WAVE_SIZE -#define WAVE_SIZE 1 -#endif - inline int calc_lut(__local int* smem, int val, int tid) { smem[tid] = val; @@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid) return smem[tid]; } -#ifdef CPU -inline void reduce(volatile __local int* smem, int val, int tid) +inline int reduce(__local volatile int* smem, int val, int tid) { smem[tid] = val; barrier(CLK_LOCAL_MEM_FENCE); @@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid) barrier(CLK_LOCAL_MEM_FENCE); if (tid < 32) + { smem[tid] += smem[tid + 32]; + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) + { smem[tid] += smem[tid + 16]; + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) + { smem[tid] += smem[tid + 8]; + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) + { smem[tid] += smem[tid + 4]; + } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 2) - smem[tid] += smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 1) - smem[256] = smem[tid] + smem[tid + 1]; - barrier(CLK_LOCAL_MEM_FENCE); -} - -#else - -inline void reduce(__local volatile int* smem, int val, int tid) -{ - smem[tid] = val; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 128) - smem[tid] = val += smem[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 64) - smem[tid] = val += smem[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 32) - { - smem[tid] += smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) + if (tid == 0) { -#endif - smem[tid] += smem[tid + 16]; -#if WAVE_SIZE < 16 + smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]); } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { -#endif - smem[tid] += smem[tid + 8]; - smem[tid] += smem[tid + 4]; - smem[tid] += smem[tid + 2]; - smem[tid] += smem[tid + 1]; - } + val = smem[0]; + barrier(CLK_LOCAL_MEM_FENCE); + return val; } -#endif __kernel void calcLut(__global __const uchar * src, const int srcStep, const int src_offset, __global uchar * lut, @@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep, } // find number of overall clipped samples - reduce(smem, clipped, tid); - barrier(CLK_LOCAL_MEM_FENCE); -#ifdef CPU - clipped = smem[256]; -#else - clipped = smem[0]; -#endif - barrier(CLK_LOCAL_MEM_FENCE); + clipped = reduce(smem, clipped, tid); // redistribute clipped samples evenly int redistBatch = clipped / 256;