Merge pull request #11409 from tomoaki0705/fixCLAHEfailure

Arm: fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4 (#11409)

* fix the test failure of OCL_Imgproc/CLAHETest.Accuracy on ODROID-XU4
  * avoid the race condition in the reduce

* imgproc(ocl): simplify CLAHE code

* remove unused class
pull/11406/head
Authored by Tomoaki Teshima 7 years ago; committed by Alexander Alekhin
parent 71d406b40c
commit 87a4f4ab3a
Changed files:
  1. modules/imgproc/src/clahe.cpp (11 lines changed)
  2. modules/imgproc/src/opencl/clahe.cl (72 lines changed)

@ -54,16 +54,7 @@ namespace clahe
const int tilesX, const int tilesY, const cv::Size tileSize, const int tilesX, const int tilesY, const cv::Size tileSize,
const int clipLimit, const float lutScale) const int clipLimit, const float lutScale)
{ {
cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc); cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
if(k.empty()) if(k.empty())
return false; return false;

@ -43,10 +43,6 @@
// //
//M*/ //M*/
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
inline int calc_lut(__local int* smem, int val, int tid) inline int calc_lut(__local int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
@ -60,8 +56,7 @@ inline int calc_lut(__local int* smem, int val, int tid)
return smem[tid]; return smem[tid];
} }
#ifdef CPU inline int reduce(__local volatile int* smem, int val, int tid)
inline void reduce(volatile __local int* smem, int val, int tid)
{ {
smem[tid] = val; smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -75,69 +70,39 @@ inline void reduce(volatile __local int* smem, int val, int tid)
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32) if (tid < 32)
{
smem[tid] += smem[tid + 32]; smem[tid] += smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) if (tid < 16)
{
smem[tid] += smem[tid + 16]; smem[tid] += smem[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) if (tid < 8)
{
smem[tid] += smem[tid + 8]; smem[tid] += smem[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4) if (tid < 4)
{
smem[tid] += smem[tid + 4]; smem[tid] += smem[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2) if (tid == 0)
smem[tid] += smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
smem[256] = smem[tid] + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
inline void reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
smem[tid] = val += smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
smem[tid] = val += smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{ {
#endif smem[0] = (smem[0] + smem[1]) + (smem[2] + smem[3]);
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) val = smem[0];
{ barrier(CLK_LOCAL_MEM_FENCE);
#endif return val;
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
} }
#endif
__kernel void calcLut(__global __const uchar * src, const int srcStep, __kernel void calcLut(__global __const uchar * src, const int srcStep,
const int src_offset, __global uchar * lut, const int src_offset, __global uchar * lut,
@ -179,14 +144,7 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep,
} }
// find number of overall clipped samples // find number of overall clipped samples
reduce(smem, clipped, tid); clipped = reduce(smem, clipped, tid);
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
clipped = smem[256];
#else
clipped = smem[0];
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// redistribute clipped samples evenly // redistribute clipped samples evenly
int redistBatch = clipped / 256; int redistBatch = clipped / 256;

Loading…
Cancel
Save