optimized cv::calcHist

pull/2840/head
Ilya Lavrenov 11 years ago
parent 071daa1d8f
commit c072c28e28
  1. 21
      modules/imgproc/src/histogram.cpp
  2. 83
      modules/imgproc/src/opencl/histogram.cl

@ -1477,14 +1477,18 @@ enum
static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S) static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S)
{ {
int compunits = ocl::Device::getDefault().maxComputeUnits(); const ocl::Device & dev = ocl::Device::getDefault();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); int compunits = dev.maxComputeUnits();
size_t wgs = dev.maxWorkGroupSize();
Size size = _src.size(); Size size = _src.size();
bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0; bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0;
int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src));
ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc, ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D cn=%d", format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s",
BINS, compunits, wgs, use16 ? 16 : 1)); BINS, compunits, wgs, kercn,
kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)),
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
if (k1.empty()) if (k1.empty())
return false; return false;
@ -1492,18 +1496,21 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32
UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1), UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1),
hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1); hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1);
k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); k1.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
size_t globalsize = compunits * wgs; size_t globalsize = compunits * wgs;
if (!k1.run(1, &globalsize, &wgs, false)) if (!k1.run(1, &globalsize, &wgs, false))
return false; return false;
ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc, ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs)); format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d",
BINS, compunits, (int)wgs));
if (k2.empty()) if (k2.empty())
return false; return false;
k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist)); k2.args(ocl::KernelArg::PtrReadOnly(ghist),
ocl::KernelArg::PtrWriteOnly(hist));
if (!k2.run(1, &wgs, &wgs, false)) if (!k2.run(1, &wgs, &wgs, false))
return false; return false;

@ -37,58 +37,78 @@
// //
// //
#ifndef cn #ifndef kercn
#define cn 1 #define kercn 1
#endif #endif
#if cn == 16 #ifndef T
#define T uchar16
#else
#define T uchar #define T uchar
#endif #endif
__kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, __kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * hist, int total) __global uchar * histptr, int total)
{ {
int lid = get_local_id(0); int lid = get_local_id(0);
int id = get_global_id(0) * cn; int id = get_global_id(0) * kercn;
int gid = get_group_id(0); int gid = get_group_id(0);
__local int localhist[BINS]; __local int localhist[BINS];
#pragma unroll
for (int i = lid; i < BINS; i += WGS) for (int i = lid; i < BINS; i += WGS)
localhist[i] = 0; localhist[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int grain = HISTS_COUNT * WGS * cn; id < total; id += grain) int src_index;
for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain)
{ {
int src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols); #ifdef HAVE_SRC_CONT
#if cn == 1 src_index = id;
atomic_inc(localhist + convert_int(src[src_index]));
#else #else
src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols);
#endif
#if kercn == 1
atomic_inc(localhist + convert_int(src[src_index]));
#elif kercn == 4
int value = *(__global const int *)(src + src_index);
atomic_inc(localhist + (value & 0xff));
atomic_inc(localhist + ((value >> 8) & 0xff));
atomic_inc(localhist + ((value >> 16) & 0xff));
atomic_inc(localhist + ((value >> 24) & 0xff));
#elif kercn >= 2
T value = *(__global const T *)(src + src_index); T value = *(__global const T *)(src + src_index);
atomic_inc(localhist + convert_int(value.s0)); atomic_inc(localhist + value.s0);
atomic_inc(localhist + convert_int(value.s1)); atomic_inc(localhist + value.s1);
atomic_inc(localhist + convert_int(value.s2)); #if kercn >= 4
atomic_inc(localhist + convert_int(value.s3)); atomic_inc(localhist + value.s2);
atomic_inc(localhist + convert_int(value.s4)); atomic_inc(localhist + value.s3);
atomic_inc(localhist + convert_int(value.s5)); #if kercn >= 8
atomic_inc(localhist + convert_int(value.s6)); atomic_inc(localhist + value.s4);
atomic_inc(localhist + convert_int(value.s7)); atomic_inc(localhist + value.s5);
atomic_inc(localhist + convert_int(value.s8)); atomic_inc(localhist + value.s6);
atomic_inc(localhist + convert_int(value.s9)); atomic_inc(localhist + value.s7);
atomic_inc(localhist + convert_int(value.sA)); #if kercn == 16
atomic_inc(localhist + convert_int(value.sB)); atomic_inc(localhist + value.s8);
atomic_inc(localhist + convert_int(value.sC)); atomic_inc(localhist + value.s9);
atomic_inc(localhist + convert_int(value.sD)); atomic_inc(localhist + value.sA);
atomic_inc(localhist + convert_int(value.sE)); atomic_inc(localhist + value.sB);
atomic_inc(localhist + convert_int(value.sF)); atomic_inc(localhist + value.sC);
atomic_inc(localhist + value.sD);
atomic_inc(localhist + value.sE);
atomic_inc(localhist + value.sF);
#endif
#endif
#endif
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int));
#pragma unroll
for (int i = lid; i < BINS; i += WGS) for (int i = lid; i < BINS; i += WGS)
*(__global int *)(hist + mad24(gid, BINS * (int)sizeof(int), i * (int)sizeof(int))) = localhist[i]; hist[i] = localhist[i];
} }
__kernel void merge_histogram(__global const int * ghist, __global int * hist) __kernel void merge_histogram(__global const int * ghist, __global int * hist)
@ -97,15 +117,16 @@ __kernel void merge_histogram(__global const int * ghist, __global int * hist)
#pragma unroll #pragma unroll
for (int i = lid; i < BINS; i += WGS) for (int i = lid; i < BINS; i += WGS)
hist[i] = 0; hist[i] = ghist[i];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll #pragma unroll
for (int i = 0; i < HISTS_COUNT; ++i) for (int i = 1; i < HISTS_COUNT; ++i)
{ {
ghist += BINS;
#pragma unroll #pragma unroll
for (int j = lid; j < BINS; j += WGS) for (int j = lid; j < BINS; j += WGS)
hist[j] += ghist[mad24(i, BINS, j)]; hist[j] += ghist[j];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
} }

Loading…
Cancel
Save