diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 7a96da62aa..fffc194ff7 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -54,16 +54,7 @@ namespace clahe
         const int tilesX, const int tilesY, const cv::Size tileSize,
         const int clipLimit, const float lutScale)
     {
-        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
-
-        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
-        cv::String opts;
-        if(is_cpu)
-            opts = "-D CPU ";
-        else
-            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
-
-        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
+        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
         if(k.empty())
             return false;
 
diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl
index ba69085634..1c806e9b6a 100644
--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@@ -43,10 +43,6 @@
 //
 //M*/
 
-#ifndef WAVE_SIZE
-#define WAVE_SIZE 1
-#endif
-
 inline int calc_lut(__local int* smem, int val, int tid)
 {
     smem[tid] = val;
@@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid)
     return smem[tid];
 }
 
-#ifdef CPU
-inline void reduce(volatile __local int* smem, int val, int tid)
+inline int reduce(__local volatile int* smem, int val, int tid)
 {
     smem[tid] = val;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid)
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 32)
+    {
         smem[tid] += smem[tid + 32];
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 16)
+    {
         smem[tid] += smem[tid + 16];
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 8)
+    {
         smem[tid] += smem[tid + 8];
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (tid < 4)
+    {
         smem[tid] += smem[tid + 4];
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 2)
-        smem[tid] += smem[tid + 2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-        smem[256] = smem[tid] + smem[tid + 1];
-    barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-#else
-
-inline void reduce(__local volatile int* smem, int val, int tid)
-{
-    smem[tid] = val;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 128)
-        smem[tid] = val += smem[tid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 64)
-        smem[tid] = val += smem[tid + 64];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 32)
-    {
-        smem[tid] += smem[tid + 32];
-#if WAVE_SIZE < 32
-    } barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
+    if (tid == 0)
     {
-#endif
-        smem[tid] += smem[tid + 16];
-#if WAVE_SIZE < 16
+        smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]);
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 8)
-    {
-#endif
-        smem[tid] += smem[tid + 8];
-        smem[tid] += smem[tid + 4];
-        smem[tid] += smem[tid + 2];
-        smem[tid] += smem[tid + 1];
-    }
+    val = smem[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return val;
 }
-#endif
 
 __kernel void calcLut(__global __const uchar * src, const int srcStep,
                       const int src_offset, __global uchar * lut,
@@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep,
         }
 
         // find number of overall clipped samples
-        reduce(smem, clipped, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        clipped = smem[256];
-#else
-        clipped = smem[0];
-#endif
-        barrier(CLK_LOCAL_MEM_FENCE);
+        clipped = reduce(smem, clipped, tid);
 
         // redistribute clipped samples evenly
         int redistBatch = clipped / 256;