diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 6fc23d2ce9..0385a187f6 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1410,9 +1410,12 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32 { int compunits = ocl::Device::getDefault().maxComputeUnits(); size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + Size size = _src.size(); + bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0; ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc, - format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, wgs)); + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D cn=%d", + BINS, compunits, wgs, use16 ? 16 : 1)); if (k1.empty()) return false; @@ -1420,8 +1423,7 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32 UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1), hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1); - k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), - (int)src.total()); + k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); size_t globalsize = compunits * wgs; if (!k1.run(1, &globalsize, &wgs, false)) diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl index 481bdcf478..c0247a5ba2 100644 --- a/modules/imgproc/src/opencl/histogram.cl +++ b/modules/imgproc/src/opencl/histogram.cl @@ -37,11 +37,21 @@ // // +#ifndef cn +#define cn 1 +#endif + +#if cn == 16 +#define T uchar16 +#else +#define T uchar +#endif + __kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * hist, int total) { int lid = get_local_id(0); - int id = get_global_id(0); + int id = get_global_id(0) * cn; int gid = get_group_id(0); __local int localhist[BINS]; @@ -50,10 +60,30 @@ __kernel void calculate_histogram(__global const uchar * src, int src_step, int localhist[i] = 0; barrier(CLK_LOCAL_MEM_FENCE); - for (int grain = HISTS_COUNT * WGS; id < total; id += grain) + for (int grain = HISTS_COUNT * WGS * cn; id < total; id += grain) { int src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols); - atomic_inc(localhist + (int)src[src_index]); +#if cn == 1 + atomic_inc(localhist + convert_int(src[src_index])); +#else + T value = *(__global const T *)(src + src_index); + atomic_inc(localhist + convert_int(value.s0)); + atomic_inc(localhist + convert_int(value.s1)); + atomic_inc(localhist + convert_int(value.s2)); + atomic_inc(localhist + convert_int(value.s3)); + atomic_inc(localhist + convert_int(value.s4)); + atomic_inc(localhist + convert_int(value.s5)); + atomic_inc(localhist + convert_int(value.s6)); + atomic_inc(localhist + convert_int(value.s7)); + atomic_inc(localhist + convert_int(value.s8)); + atomic_inc(localhist + convert_int(value.s9)); + atomic_inc(localhist + convert_int(value.sA)); + atomic_inc(localhist + convert_int(value.sB)); + atomic_inc(localhist + convert_int(value.sC)); + atomic_inc(localhist + convert_int(value.sD)); + atomic_inc(localhist + convert_int(value.sE)); + atomic_inc(localhist + convert_int(value.sF)); +#endif } barrier(CLK_LOCAL_MEM_FENCE);