Merge pull request #2750 from ilya-lavrenov:tapi_convertto

pull/2820/head
Alexander Alekhin 11 years ago committed by OpenCV Buildbot
commit 607cd37b48
  1. 40
      modules/core/src/arithm.cpp
  2. 63
      modules/core/src/convert.cpp
  3. 7
      modules/core/src/dxt.cpp
  4. 54
      modules/core/src/mathfuncs.cpp
  5. 12
      modules/core/src/matmul.cpp
  6. 8
      modules/core/src/matrix.cpp
  7. 82
      modules/core/src/opencl/arithm.cl
  8. 19
      modules/core/src/opencl/convert.cl
  9. 50
      modules/core/src/opencl/inrange.cl
  10. 22
      modules/core/src/opencl/mixchannels.cl
  11. 28
      modules/core/src/opencl/mulspectrums.cl
  12. 11
      modules/core/src/opencl/set_identity.cl
  13. 52
      modules/core/src/opencl/split_merge.cl
  14. 27
      modules/core/src/stat.cpp
  15. 8
      modules/core/src/umatrix.cpp

@ -1008,7 +1008,8 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
int srcdepth = CV_MAT_DEPTH(srctype); int srcdepth = CV_MAT_DEPTH(srctype);
int cn = CV_MAT_CN(srctype); int cn = CV_MAT_CN(srctype);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; const ocl::Device d = ocl::Device::getDefault();
bool doubleSupport = d.doubleFPConfig() > 0;
if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
(!doubleSupport && srcdepth == CV_64F && !bitwise)) (!doubleSupport && srcdepth == CV_64F && !bitwise))
return false; return false;
@ -1016,8 +1017,9 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
char opts[1024]; char opts[1024];
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn; int scalarcn = kercn == 3 ? 4 : kercn;
int rowsPerWI = d.isIntel() ? 4 : 1;
sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d", sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@ -1025,7 +1027,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
kercn); kercn, rowsPerWI);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
if (k.empty()) if (k.empty())
@ -1068,7 +1070,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
k.args(src1arg, src2arg, maskarg, dstarg); k.args(src1arg, src2arg, maskarg, dstarg);
} }
size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, 0, false); return k.run(2, globalsize, 0, false);
} }
@ -1371,7 +1373,8 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
void* usrdata, int oclop, void* usrdata, int oclop,
bool haveScalar ) bool haveScalar )
{ {
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; const ocl::Device d = ocl::Device::getDefault();
bool doubleSupport = d.doubleFPConfig() > 0;
int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
bool haveMask = !_mask.empty(); bool haveMask = !_mask.empty();
@ -1388,12 +1391,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
return false; return false;
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn; int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
char cvtstr[4][32], opts[1024]; char cvtstr[4][32], opts[1024];
sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
"-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
"-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d", "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d",
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
@ -1404,7 +1407,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn); doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI);
size_t usrdata_esz = CV_ELEM_SIZE(wdepth); size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
const uchar* usrdata_p = (const uchar*)usrdata; const uchar* usrdata_p = (const uchar*)usrdata;
@ -1478,7 +1481,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
k.args(src1arg, src2arg, maskarg, dstarg); k.args(src1arg, src2arg, maskarg, dstarg);
} }
size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -2764,7 +2767,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2)) if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
return false; return false;
int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
// Workaround for bug with "?:" operator in AMD OpenCL compiler // Workaround for bug with "?:" operator in AMD OpenCL compiler
if (depth1 >= CV_16U) if (depth1 >= CV_16U)
kercn = 1; kercn = 1;
@ -2775,14 +2778,14 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d" String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
" -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s" " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
" -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s", " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
haveScalar ? "UNARY_OP" : "BINARY_OP", haveScalar ? "UNARY_OP" : "BINARY_OP",
ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)), ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
ocl::typeToStr(CV_8UC(kercn)), kercn, ocl::typeToStr(CV_8UC(kercn)), kercn,
ocl::convertTypeStr(depth1, CV_8U, kercn, cvt), ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
operationMap[op], ocl::typeToStr(depth1), operationMap[op], ocl::typeToStr(depth1),
ocl::typeToStr(depth1), ocl::typeToStr(CV_8U), ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""); doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
@ -2839,7 +2842,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
ocl::KernelArg::WriteOnly(dst, cn, kercn)); ocl::KernelArg::WriteOnly(dst, cn, kercn));
} }
size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -3091,11 +3094,12 @@ static InRangeFunc getInRangeFunc(int depth)
static bool ocl_inRange( InputArray _src, InputArray _lowerb, static bool ocl_inRange( InputArray _src, InputArray _lowerb,
InputArray _upperb, OutputArray _dst ) InputArray _upperb, OutputArray _dst )
{ {
const ocl::Device & d = ocl::Device::getDefault();
int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind(); int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size(); Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type(); int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype); int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
int cn = CV_MAT_CN(stype); int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
bool lbScalar = false, ubScalar = false; bool lbScalar = false, ubScalar = false;
if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) || if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
@ -3119,7 +3123,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
if (lbScalar != ubScalar) if (lbScalar != ubScalar)
return false; return false;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, bool doubleSupport = d.doubleFPConfig() > 0,
haveScalar = lbScalar && ubScalar; haveScalar = lbScalar && ubScalar;
if ( (!doubleSupport && sdepth == CV_64F) || if ( (!doubleSupport && sdepth == CV_64F) ||
@ -3184,13 +3188,13 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
uscalar.copyTo(uscalaru); uscalar.copyTo(uscalaru);
ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru), ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
ocl::KernelArg::PtrReadOnly(uscalaru)); ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
} }
else else
ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru), ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
ocl::KernelArg::ReadOnlyNoSize(uscalaru)); ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
size_t globalsize[2] = { ssize.width, ssize.height }; size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI };
return ker.run(2, globalsize, NULL, false); return ker.run(2, globalsize, NULL, false);
} }

@ -270,21 +270,22 @@ namespace cv {
static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
{ {
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
String dstargs, dstdecl, processelem; String dstargs, processelem, indexdecl;
for (int i = 0; i < cn; ++i) for (int i = 0; i < cn; ++i)
{ {
dstargs += format("DECLARE_DST_PARAM(%d)", i); dstargs += format("DECLARE_DST_PARAM(%d)", i);
dstdecl += format("DECLARE_DATA(%d)", i); indexdecl += format("DECLARE_INDEX(%d)", i);
processelem += format("PROCESS_ELEM(%d)", i); processelem += format("PROCESS_ELEM(%d)", i);
} }
ocl::Kernel k("split", ocl::core::split_merge_oclsrc, ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s " format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
"-D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s", " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
ocl::memopTypeToStr(depth), cn, dstargs.c_str(), ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
dstdecl.c_str(), processelem.c_str())); processelem.c_str(), indexdecl.c_str()));
if (k.empty()) if (k.empty())
return false; return false;
@ -299,8 +300,9 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat())); int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
for (int i = 0; i < cn; ++i) for (int i = 0; i < cn; ++i)
argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i])); argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
k.set(argidx, rowsPerWI);
size_t globalsize[2] = { size.width, size.height }; size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -419,7 +421,8 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
_mv.getUMatVector(src); _mv.getUMatVector(src);
CV_Assert(!src.empty()); CV_Assert(!src.empty());
int type = src[0].type(), depth = CV_MAT_DEPTH(type); int type = src[0].type(), depth = CV_MAT_DEPTH(type),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
Size size = src[0].size(); Size size = src[0].size();
for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i) for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
@ -440,20 +443,20 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
} }
int dcn = (int)ksrc.size(); int dcn = (int)ksrc.size();
String srcargs, srcdecl, processelem, cndecl; String srcargs, processelem, cndecl, indexdecl;
for (int i = 0; i < dcn; ++i) for (int i = 0; i < dcn; ++i)
{ {
srcargs += format("DECLARE_SRC_PARAM(%d)", i); srcargs += format("DECLARE_SRC_PARAM(%d)", i);
srcdecl += format("DECLARE_DATA(%d)", i);
processelem += format("PROCESS_ELEM(%d)", i); processelem += format("PROCESS_ELEM(%d)", i);
indexdecl += format("DECLARE_INDEX(%d)", i);
cndecl += format(" -D scn%d=%d", i, ksrc[i].channels()); cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
} }
ocl::Kernel k("merge", ocl::core::split_merge_oclsrc, ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s" format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
" -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s%s", " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
dcn, ocl::memopTypeToStr(depth), srcargs.c_str(), dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
srcdecl.c_str(), processelem.c_str(), cndecl.c_str())); indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
if (k.empty()) if (k.empty())
return false; return false;
@ -463,9 +466,10 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
int argidx = 0; int argidx = 0;
for (int i = 0; i < dcn; ++i) for (int i = 0; i < dcn; ++i)
argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i])); argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
k.set(argidx, ocl::KernelArg::WriteOnly(dst)); argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
k.set(argidx, rowsPerWI);
size_t globalsize[2] = { dst.cols, dst.rows }; size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -683,14 +687,15 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d
CV_Assert(nsrc > 0 && ndst > 0); CV_Assert(nsrc > 0 && ndst > 0);
Size size = src[0].size(); Size size = src[0].size();
int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth); int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
for (size_t i = 1, ssize = src.size(); i < ssize; ++i) for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
CV_Assert(src[i].size() == size && src[i].depth() == depth); CV_Assert(src[i].size() == size && src[i].depth() == depth);
for (size_t i = 0, dsize = dst.size(); i < dsize; ++i) for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
CV_Assert(dst[i].size() == size && dst[i].depth() == depth); CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
String declsrc, decldst, declproc, declcn; String declsrc, decldst, declproc, declcn, indexdecl;
std::vector<UMat> srcargs(npairs), dstargs(npairs); std::vector<UMat> srcargs(npairs), dstargs(npairs);
for (size_t i = 0; i < npairs; ++i) for (size_t i = 0; i < npairs; ++i)
@ -711,14 +716,16 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d
declsrc += format("DECLARE_INPUT_MAT(%d)", i); declsrc += format("DECLARE_INPUT_MAT(%d)", i);
decldst += format("DECLARE_OUTPUT_MAT(%d)", i); decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
indexdecl += format("DECLARE_INDEX(%d)", i);
declproc += format("PROCESS_ELEM(%d)", i); declproc += format("PROCESS_ELEM(%d)", i);
declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels()); declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
} }
ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc, ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
format("-D T=%s -D DECLARE_INPUT_MATS=%s -D DECLARE_OUTPUT_MATS=%s" format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
" -D PROCESS_ELEMS=%s%s", ocl::memopTypeToStr(depth), " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
declsrc.c_str(), decldst.c_str(), declproc.c_str(), declcn.c_str())); ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
if (k.empty()) if (k.empty())
return false; return false;
@ -727,9 +734,11 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d
argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i])); argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
for (size_t i = 0; i < npairs; ++i) for (size_t i = 0; i < npairs; ++i)
argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i])); argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
k.set(k.set(argindex, size.height), size.width); argindex = k.set(argindex, size.height);
argindex = k.set(argindex, size.width);
k.set(argindex, rowsPerWI);
size_t globalsize[2] = { size.width, size.height }; size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -1357,9 +1366,10 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
{ {
const ocl::Device & d = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = ocl::predictOptimalVectorWidth(_src, _dst); kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; bool doubleSupport = d.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F) if (!doubleSupport && depth == CV_64F)
return false; return false;
@ -1368,13 +1378,14 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
int wdepth = std::max(depth, CV_32F); int wdepth = std::max(depth, CV_32F);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
" -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s", " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
" -D workT1=%s -D rowsPerWI=%d%s",
ocl::typeToStr(CV_8UC(kercn)), ocl::typeToStr(CV_8UC(kercn)),
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
ocl::typeToStr(wdepth), ocl::typeToStr(wdepth), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -1391,7 +1402,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
else if (wdepth == CV_64F) else if (wdepth == CV_64F)
k.args(srcarg, dstarg, alpha, beta); k.args(srcarg, dstarg, alpha, beta);
size_t globalsize[2] = { src.cols * cn / kercn, src.rows }; size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -2489,7 +2489,8 @@ namespace cv {
static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB, static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
OutputArray _dst, int flags, bool conjB ) OutputArray _dst, int flags, bool conjB )
{ {
int atype = _srcA.type(), btype = _srcB.type(); int atype = _srcA.type(), btype = _srcB.type(),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
Size asize = _srcA.size(), bsize = _srcB.size(); Size asize = _srcA.size(), bsize = _srcB.size();
CV_Assert(asize == bsize); CV_Assert(asize == bsize);
@ -2509,9 +2510,9 @@ static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
return false; return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(A), ocl::KernelArg::ReadOnlyNoSize(B), k.args(ocl::KernelArg::ReadOnlyNoSize(A), ocl::KernelArg::ReadOnlyNoSize(B),
ocl::KernelArg::WriteOnly(dst)); ocl::KernelArg::WriteOnly(dst), rowsPerWI);
size_t globalsize[2] = { asize.width, asize.height }; size_t globalsize[2] = { asize.width, (asize.height + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -65,13 +65,15 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in
int kercn = oclop == OCL_OP_PHASE_DEGREES || int kercn = oclop == OCL_OP_PHASE_DEGREES ||
oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0; const ocl::Device d = ocl::Device::getDefault();
bool double_support = d.doubleFPConfig() > 0;
if (!double_support && depth == CV_64F) if (!double_support && depth == CV_64F)
return false; return false;
int rowsPerWI = d.isIntel() ? 4 : 1;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP", format("-D %s -D %s -D dstT=%s -D rowsPerWI=%d%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), rowsPerWI,
double_support ? " -D DOUBLE_SUPPORT" : "")); double_support ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -89,7 +91,7 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in
else else
k.args(src1arg, src2arg, dstarg); k.args(src1arg, src2arg, dstarg);
size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, 0, false); return k.run(2, globalsize, 0, false);
} }
@ -524,8 +526,10 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
static bool ocl_cartToPolar( InputArray _src1, InputArray _src2, static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
OutputArray _dst1, OutputArray _dst2, bool angleInDegrees ) OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
{ {
int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); const ocl::Device & d = ocl::Device::getDefault();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 && if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 &&
(depth == CV_32F || depth == CV_64F) && type == _src2.type()) || (depth == CV_32F || depth == CV_64F) && type == _src2.type()) ||
@ -533,9 +537,9 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
return false; return false;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s", format("-D BINARY_OP -D dstT=%s -D depth=%d -D rowsPerWI=%d -D OP_CTP_%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
depth, angleInDegrees ? "AD" : "AR", depth, rowsPerWI, angleInDegrees ? "AD" : "AR",
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -553,7 +557,7 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnly(dst1, cn),
ocl::KernelArg::WriteOnlyNoSize(dst2)); ocl::KernelArg::WriteOnlyNoSize(dst2));
size_t globalsize[2] = { dst1.cols * cn, dst1.rows }; size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -713,16 +717,18 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
static bool ocl_polarToCart( InputArray _mag, InputArray _angle, static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
OutputArray _dst1, OutputArray _dst2, bool angleInDegrees ) OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
{ {
int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); const ocl::Device & d = ocl::Device::getDefault();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
if ( !doubleSupport && depth == CV_64F ) if ( !doubleSupport && depth == CV_64F )
return false; return false;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s", format("-D dstT=%s -D rowsPerWI=%d -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth, ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), rowsPerWI,
angleInDegrees ? "AD" : "AR", depth, angleInDegrees ? "AD" : "AR",
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -738,7 +744,7 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle), k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle),
ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2)); ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2));
size_t globalsize[2] = { dst1.cols * cn, dst1.rows }; size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -2103,8 +2109,10 @@ static IPowFunc ipowTab[] =
static bool ocl_pow(InputArray _src, double power, OutputArray _dst, static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
bool is_ipower, int ipower) bool is_ipower, int ipower)
{ {
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); const ocl::Device & d = ocl::Device::getDefault();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
if (depth == CV_64F && !doubleSupport) if (depth == CV_64F && !doubleSupport)
return false; return false;
@ -2113,8 +2121,8 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW"; const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth), format("-D dstT=%s -D rowsPerWI=%d -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
op, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); rowsPerWI, op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -2137,7 +2145,7 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
k.args(srcarg, dstarg, power); k.args(srcarg, dstarg, power);
} }
size_t globalsize[2] = { dst.cols * cn, dst.rows }; size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }
@ -2491,8 +2499,10 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma
static bool ocl_patchNaNs( InputOutputArray _a, float value ) static bool ocl_patchNaNs( InputOutputArray _a, float value )
{ {
int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int")); format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d",
rowsPerWI));
if (k.empty()) if (k.empty())
return false; return false;
@ -2502,7 +2512,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value )
k.args(ocl::KernelArg::ReadOnlyNoSize(a), k.args(ocl::KernelArg::ReadOnlyNoSize(a),
ocl::KernelArg::WriteOnly(a, cn), (float)value); ocl::KernelArg::WriteOnly(a, cn), (float)value);
size_t globalsize[2] = { a.cols * cn, a.rows }; size_t globalsize[2] = { a.cols * cn, (a.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -2153,9 +2153,10 @@ typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, i
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{ {
const ocl::Device & d = ocl::Device::getDefault();
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F), int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F),
kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst); kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; bool doubleSupport = d.doubleFPConfig() > 0;
Size size = _src1.size(); Size size = _src1.size();
if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() ) if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
@ -2164,13 +2165,14 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
char cvt[2][50]; char cvt[2][50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
" -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s", " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
" -D wdepth=%d%s -D rowsPerWI=%d",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]), ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
ocl::typeToStr(wdepth), wdepth, ocl::typeToStr(wdepth), wdepth,
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
if (k.empty()) if (k.empty())
return false; return false;
@ -2187,7 +2189,7 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
else else
k.args(src1arg, src2arg, dstarg, alpha); k.args(src1arg, src2arg, dstarg, alpha);
size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -2742,7 +2742,8 @@ namespace cv {
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s ) static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
{ {
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn); sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc, ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type), format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
@ -2751,9 +2752,10 @@ static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
return false; return false;
UMat m = _m.getUMat(); UMat m = _m.getUMat();
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s))); k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
rowsPerWI);
size_t globalsize[2] = { m.cols, m.rows }; size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -145,6 +145,7 @@
#define EXTRA_PARAMS #define EXTRA_PARAMS
#define EXTRA_INDEX #define EXTRA_INDEX
#define EXTRA_INDEX_ADD
#if defined OP_ADD #if defined OP_ADD
#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2)) #define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
@ -363,7 +364,9 @@
#undef EXTRA_PARAMS #undef EXTRA_PARAMS
#define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
#undef EXTRA_INDEX #undef EXTRA_INDEX
#define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2)) #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
#undef EXTRA_INDEX_ADD
#define EXTRA_INDEX_ADD dst_index2 += dststep2
#endif #endif
#if defined UNARY_OP || defined MASK_UNARY_OP #if defined UNARY_OP || defined MASK_UNARY_OP
@ -393,18 +396,25 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int rows, int cols EXTRA_PARAMS ) int rows, int cols EXTRA_PARAMS )
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
#endif #endif
int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
EXTRA_INDEX; EXTRA_INDEX;
PROCESS_ELEM; for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
{
PROCESS_ELEM;
#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
src2_index += srcstep2;
#endif
EXTRA_INDEX_ADD;
}
} }
} }
@ -417,19 +427,21 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int rows, int cols EXTRA_PARAMS ) int rows, int cols EXTRA_PARAMS )
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
int mask_index = mad24(y, maskstep, x + maskoffset); int mask_index = mad24(y0, maskstep, x + maskoffset);
if( mask[mask_index] ) int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
{ int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,
mask_index += maskstep, dst_index += dststep)
PROCESS_ELEM; if (mask[mask_index])
} {
PROCESS_ELEM;
}
} }
} }
@ -440,14 +452,17 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int rows, int cols EXTRA_PARAMS ) int rows, int cols EXTRA_PARAMS )
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
PROCESS_ELEM; for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
{
PROCESS_ELEM;
}
} }
} }
@ -459,18 +474,19 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int rows, int cols EXTRA_PARAMS ) int rows, int cols EXTRA_PARAMS )
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
int mask_index = mad24(y, maskstep, x + maskoffset); int mask_index = mad24(y0, maskstep, x + maskoffset);
if( mask[mask_index] ) int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
{ int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)
if (mask[mask_index])
PROCESS_ELEM; {
} PROCESS_ELEM;
}
} }
} }

@ -53,19 +53,22 @@
__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset, __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
WT alpha, WT beta) WT alpha, WT beta, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT), src_offset)); int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset)); int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index); for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step)
__global dstT * dst = (__global dstT *)(dstptr + dst_index); {
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
dst[0] = convertToDT(mad(convertToWT(src[0]), alpha, beta)); dst[0] = convertToDT(fma(convertToWT(src[0]), alpha, beta));
}
} }
} }

@ -52,37 +52,47 @@
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset, __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
#ifdef HAVE_SCALAR #ifdef HAVE_SCALAR
__global const T * src2, __global const T * src3 __global const T * src2, __global const T * src3,
#else #else
__global const uchar * src2ptr, int src2_step, int src2_offset, __global const uchar * src2ptr, int src2_step, int src2_offset,
__global const uchar * src3ptr, int src3_step, int src3_offset __global const uchar * src3ptr, int src3_step, int src3_offset,
#endif #endif
) int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset)); int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
int dst_index = mad24(y, dst_step, x + dst_offset); int dst_index = mad24(y0, dst_step, x + dst_offset);
__global const T * src1 = (__global const T *)(src1ptr + src1_index); #ifndef HAVE_SCALAR
__global uchar * dst = dstptr + dst_index; int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
#endif
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
{
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
__global uchar * dst = dstptr + dst_index;
#ifndef HAVE_SCALAR #ifndef HAVE_SCALAR
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset)); __global const T * src2 = (__global const T *)(src2ptr + src2_index);
int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset)); __global const T * src3 = (__global const T *)(src3ptr + src3_index);
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
#endif #endif
dst[0] = 255; dst[0] = 255;
for (int c = 0; c < cn; ++c)
if (src2[c] > src1[c] || src3[c] < src1[c])
{
dst[0] = 0;
break;
}
for (int c = 0; c < cn; ++c) #ifndef HAVE_SCALAR
if (src2[c] > src1[c] || src3[c] < src1[c]) src2_index += src2_step;
{ src3_index += src3_step;
dst[0] = 0; #endif
break; }
}
} }
} }

@ -45,20 +45,28 @@
__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset, __global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
#define DECLARE_OUTPUT_MAT(i) \ #define DECLARE_OUTPUT_MAT(i) \
__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset, __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
#define DECLARE_INDEX(i) \
int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset));
#define PROCESS_ELEM(i) \ #define PROCESS_ELEM(i) \
int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \ __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \
__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \ __global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
dst##i[0] = src##i[0]; dst##i[0] = src##i[0]; \
src##i##_index += src##i##_step; \
dst##i##_index += dst##i##_step;
__kernel void mixChannels(DECLARE_INPUT_MATS DECLARE_OUTPUT_MATS int rows, int cols) __kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
PROCESS_ELEMS DECLARE_INDEX_N
for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y)
{
PROCESS_ELEM_N
}
} }
} }

@ -56,26 +56,30 @@ inline float2 conjf(float2 a)
__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset, __kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,
__global const uchar * src2ptr, int src2_step, int src2_offset, __global const uchar * src2ptr, int src2_step, int src2_offset,
__global uchar * dstptr, int dst_step, int dst_offset, __global uchar * dstptr, int dst_step, int dst_offset,
int dst_rows, int dst_cols) int dst_rows, int dst_cols, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset)); int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset)); int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset)); int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
float2 src0 = *(__global const float2 *)(src1ptr + src1_index); for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y,
float2 src1 = *(__global const float2 *)(src2ptr + src2_index); src1_index += src1_step, src2_index += src2_step, dst_index += dst_step)
__global float2 * dst = (__global float2 *)(dstptr + dst_index); {
float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
__global float2 * dst = (__global float2 *)(dstptr + dst_index);
#ifdef CONJ #ifdef CONJ
float2 v = cmulf(src0, conjf(src1)); float2 v = cmulf(src0, conjf(src1));
#else #else
float2 v = cmulf(src0, src1); float2 v = cmulf(src0, src1);
#endif #endif
dst[0] = v; dst[0] = v;
}
} }
} }

@ -56,15 +56,16 @@
#endif #endif
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols, __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
ST scalar_) ST scalar_, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
storepix(x == y ? scalar : (T)(0), srcptr + src_index); for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
} }
} }

@ -44,42 +44,58 @@
#ifdef OP_MERGE #ifdef OP_MERGE
#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset, #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
#define DECLARE_DATA(index) __global const T * src##index = \ #define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset));
(__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset))); #define PROCESS_ELEM(index) \
#define PROCESS_ELEM(index) dst[index] = src##index[0]; __global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \
dst[index] = src##index[0]; \
src##index##_index += src##index##_step;
__kernel void merge(DECLARE_SRC_PARAMS_N __kernel void merge(DECLARE_SRC_PARAMS_N
__global uchar * dstptr, int dst_step, int dst_offset, __global uchar * dstptr, int dst_step, int dst_offset,
int rows, int cols) int rows, int cols, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
DECLARE_DATA_N DECLARE_INDEX_N
__global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset))); int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset));
PROCESS_ELEMS_N
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step)
{
__global T * dst = (__global T *)(dstptr + dst_index);
PROCESS_ELEMS_N
}
} }
} }
#elif defined OP_SPLIT #elif defined OP_SPLIT
#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset #define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset
#define DECLARE_DATA(index) __global T * dst##index = \ #define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset));
(__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset))); #define PROCESS_ELEM(index) \
#define PROCESS_ELEM(index) dst##index[0] = src[index]; __global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \
dst##index[0] = src[index]; \
dst##index##_index += dst##index##_step;
__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS) __kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y0 = get_global_id(1) * rowsPerWI;
if (x < cols && y < rows) if (x < cols)
{ {
DECLARE_DATA_N DECLARE_INDEX_N
__global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset))); int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset));
PROCESS_ELEMS_N
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
{
__global const T * src = (__global const T *)(srcptr + src_index);
PROCESS_ELEMS_N
}
} }
} }

@ -2035,8 +2035,9 @@ static NormDiffFunc getNormDiffFunc(int normType, int depth)
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result ) static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
{ {
const ocl::Device & d = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, bool doubleSupport = d.doubleFPConfig() > 0,
haveMask = _mask.kind() != _InputArray::NONE; haveMask = _mask.kind() != _InputArray::NONE;
if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
@ -2053,13 +2054,14 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
if (depth != CV_8U && depth != CV_16U) if (depth != CV_8U && depth != CV_16U)
{ {
int wdepth = std::max(CV_32S, depth); int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1;
char cvt[50]; char cvt[50];
ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc, ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc,
format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s -D convertToDT=%s%s", format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s"
" -D convertToDT=%s -D rowsPerWI=%d%s",
ocl::typeToStr(wdepth), ocl::typeToStr(depth), ocl::typeToStr(wdepth), ocl::typeToStr(depth),
ocl::convertTypeStr(depth, wdepth, 1, cvt), ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (kabs.empty()) if (kabs.empty())
return false; return false;
@ -2067,7 +2069,7 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn)); abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn));
kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn)); kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn));
size_t globalsize[2] = { src.cols * cn, src.rows }; size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI };
if (!kabs.run(2, globalsize, NULL, false)) if (!kabs.run(2, globalsize, NULL, false))
return false; return false;
} }
@ -2078,8 +2080,8 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
} }
else else
{ {
int dbsize = ocl::Device::getDefault().maxComputeUnits(); int dbsize = d.maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); size_t wgs = d.maxWorkGroupSize();
int wgs2_aligned = 1; int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs) while (wgs2_aligned < (int)wgs)
@ -2446,8 +2448,9 @@ namespace cv {
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{ {
int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); const ocl::Device & d = ocl::Device::getDefault();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
bool relative = (normType & NORM_RELATIVE) != 0; bool relative = (normType & NORM_RELATIVE) != 0;
normType &= ~NORM_RELATIVE; normType &= ~NORM_RELATIVE;
@ -2459,9 +2462,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
char cvt[50]; char cvt[50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1" format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
" -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT%s", " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
ocl::typeToStr(wdepth), ocl::typeToStr(depth), ocl::typeToStr(wdepth), ocl::typeToStr(depth),
ocl::convertTypeStr(depth, wdepth, 1, cvt), ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -2470,7 +2473,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2), k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
ocl::KernelArg::WriteOnly(diff, cn)); ocl::KernelArg::WriteOnly(diff, cn));
size_t globalsize[2] = { diff.cols * cn, diff.rows }; size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
if (!k.run(2, globalsize, NULL, false)) if (!k.run(2, globalsize, NULL, false))
return false; return false;

@ -721,7 +721,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() && if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
((needDouble && doubleSupport) || !needDouble) ) ((needDouble && doubleSupport) || !needDouble) )
{ {
int wdepth = std::max(CV_32F, sdepth); int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
char cvt[2][40]; char cvt[2][40];
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc, ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
@ -741,11 +741,11 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
dstarg = ocl::KernelArg::WriteOnly(dst, cn); dstarg = ocl::KernelArg::WriteOnly(dst, cn);
if (wdepth == CV_32F) if (wdepth == CV_32F)
k.args(srcarg, dstarg, alphaf, betaf); k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
else else
k.args(srcarg, dstarg, alpha, beta); k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
size_t globalsize[2] = { dst.cols * cn, dst.rows }; size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
if (k.run(2, globalsize, NULL, false)) if (k.run(2, globalsize, NULL, false))
return; return;
} }

Loading…
Cancel
Save