optimized cv::meanStdDev

pull/2781/head
Ilya Lavrenov 11 years ago
parent d940093fb5
commit 33173d900a
  1. 14
      modules/core/src/ocl.cpp
  2. 129
      modules/core/src/opencl/meanstddev.cl
  3. 76
      modules/core/src/stat.cpp

@ -4419,22 +4419,22 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
InputArray src4, InputArray src5, InputArray src6, InputArray src4, InputArray src5, InputArray src6,
InputArray src7, InputArray src8, InputArray src9) InputArray src7, InputArray src8, InputArray src9)
{ {
int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(depth); int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
Size ssize = src1.size(); Size ssize = src1.size();
const ocl::Device & d = ocl::Device::getDefault(); const ocl::Device & d = ocl::Device::getDefault();
int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(), int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(), d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
d.preferredVectorWidthDouble(), -1 }, width = vectorWidths[depth]; d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
if (d.isIntel()) if (d.isIntel())
{ {
// it's heuristic // it's heuristic
int vectorWidthsIntel[] = { 16, 16, 8, 8, 1, 1, 1, -1 }; int vectorWidthsIntel[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
width = vectorWidthsIntel[depth]; kercn = vectorWidthsIntel[depth];
} }
if (ssize.width * cn < width || width <= 0) if (ssize.width * cn < kercn || kercn <= 0)
return 1; return 1;
std::vector<size_t> offsets, steps, cols; std::vector<size_t> offsets, steps, cols;
@ -4449,7 +4449,7 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
PROCESS_SRC(src9); PROCESS_SRC(src9);
size_t size = offsets.size(); size_t size = offsets.size();
int wsz = width * esz; int wsz = kercn * esz1;
std::vector<int> dividers(size, wsz); std::vector<int> dividers(size, wsz);
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
@ -4460,14 +4460,14 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
if (dividers[i] != wsz) if (dividers[i] != wsz)
{ {
width = 1; kercn = 1;
break; break;
} }
// another strategy // another strategy
// width = *std::min_element(dividers.begin(), dividers.end()); // width = *std::min_element(dividers.begin(), dividers.end());
return width; return kercn;
} }
#undef PROCESS_SRC #undef PROCESS_SRC

@ -0,0 +1,129 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define noconvert
#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define storesqpix(val, addr) *(__global sqdstT *)(addr) = val
#define srcTSIZE (int)sizeof(srcT)
#define dstTSIZE (int)sizeof(dstT)
#define sqdstTSIZE (int)sizeof(sqdstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define storesqpix(val, addr) vstore3(val, 0, (__global sqdstT1 *)(addr))
#define srcTSIZE ((int)sizeof(srcT1)*3)
#define dstTSIZE ((int)sizeof(dstT1)*3)
#define sqdstTSIZE ((int)sizeof(sqdstT1)*3)
#endif
__kernel void meanStdDev(__global const uchar * srcptr, int src_step, int src_offset, int cols,
int total, int groups, __global uchar * dstptr
#ifdef HAVE_MASK
, __global const uchar * mask, int mask_step, int mask_offset
#endif
)
{
int lid = get_local_id(0);
int gid = get_group_id(0);
int id = get_global_id(0);
__local dstT localMemSum[WGS2_ALIGNED];
__local sqdstT localMemSqSum[WGS2_ALIGNED];
#ifdef HAVE_MASK
__local int localMemNonZero[WGS2_ALIGNED];
#endif
dstT accSum = (dstT)(0);
sqdstT accSqSum = (sqdstT)(0);
#ifdef HAVE_MASK
int accNonZero = 0;
mask += mask_offset;
#endif
srcptr += src_offset;
for (int grain = groups * WGS; id < total; id += grain)
{
#ifdef HAVE_MASK
#ifdef HAVE_SRC_CONT
int mask_index = id;
#else
int mask_index = mad24(id / cols, mask_step, id % cols);
#endif
if (mask[mask_index])
#endif
{
#ifdef HAVE_SRC_CONT
int src_index = mul24(id, srcTSIZE);
#else
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
srcT value = loadpix(srcptr + src_index);
accSum += convertToDT(value);
sqdstT dvalue = convertToSDT(value);
accSqSum = fma(dvalue, dvalue, accSqSum);
#ifdef HAVE_MASK
++accNonZero;
#endif
}
}
if (lid < WGS2_ALIGNED)
{
localMemSum[lid] = accSum;
localMemSqSum[lid] = accSqSum;
#ifdef HAVE_MASK
localMemNonZero[lid] = accNonZero;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
{
localMemSum[lid - WGS2_ALIGNED] += accSum;
localMemSqSum[lid - WGS2_ALIGNED] += accSqSum;
#ifdef HAVE_MASK
localMemNonZero[lid - WGS2_ALIGNED] += accNonZero;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
{
if (lid < lsize)
{
int lid2 = lsize + lid;
localMemSum[lid] += localMemSum[lid2];
localMemSqSum[lid] += localMemSqSum[lid2];
#ifdef HAVE_MASK
localMemNonZero[lid] += localMemNonZero[lid2];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lid == 0)
{
storepix(localMemSum[0], dstptr + dstTSIZE * gid);
storesqpix(localMemSqSum[0], dstptr + mad24(dstTSIZE, groups, sqdstTSIZE * gid));
#ifdef HAVE_MASK
*(__global int *)(dstptr + mad24(dstTSIZE + sqdstTSIZE, groups, (int)sizeof(int) * gid)) = localMemNonZero[0];
#endif
}
}

@ -878,14 +878,76 @@ namespace cv {
static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask ) static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{ {
bool haveMask = _mask.kind() != _InputArray::NONE; bool haveMask = _mask.kind() != _InputArray::NONE;
int nz = haveMask ? -1 : (int)_src.total();
Scalar mean, stddev; Scalar mean, stddev;
if (!ocl_sum(_src, mean, OCL_OP_SUM, _mask))
return false;
if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR, _mask))
return false;
int nz = haveMask ? countNonZero(_mask) : (int)_src.total(); {
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
isContinuous = _src.isContinuous();
int groups = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
dtype = CV_MAKE_TYPE(ddepth, cn),
sqdtype = CV_MAKETYPE(sqddepth, cn);
CV_Assert(!haveMask || _mask.type() == CV_8UC1);
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false;
char cvt[2][40];
String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
" -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s"
" -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
(int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
if (k.empty())
return false;
int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
dbarg = ocl::KernelArg::PtrWriteOnly(db),
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
if (haveMask)
k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
else
k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);
size_t globalsize = groups * wgs;
if (!k.run(1, &globalsize, &wgs, false))
return false;
typedef Scalar (* part_sum)(Mat m);
part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
Mat dbm = db.getMat(ACCESS_READ);
mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.data));
stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.data + groups * CV_ELEM_SIZE(dtype)));
if (haveMask)
nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.data +
groups * (CV_ELEM_SIZE(dtype) +
CV_ELEM_SIZE(sqdtype))))[0]);
}
double total = nz != 0 ? 1.0 / nz : 0; double total = nz != 0 ? 1.0 / nz : 0;
int k, j, cn = _src.channels(); int k, j, cn = _src.channels();
for (int i = 0; i < cn; ++i) for (int i = 0; i < cn; ++i)
@ -927,7 +989,7 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
ocl_meanStdDev(_src, _mean, _sdv, _mask)) ocl_meanStdDev(_src, _mean, _sdv, _mask))
Mat src = _src.getMat(), mask = _mask.getMat(); Mat src = _src.getMat(), mask = _mask.getMat();
CV_Assert( mask.empty() || mask.type() == CV_8U ); CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
int k, cn = src.channels(), depth = src.depth(); int k, cn = src.channels(), depth = src.depth();

Loading…
Cancel
Save