Merge pull request #24918 from opencv-pushbot:gitee/alalek/core_convertfp16_replacement

core(OpenCL): optimize convertTo() with CV_16F (convertFp16() replacement) #24918

relates #24909
relates #24917
relates #24892

Performance changes:

- [x] 12700K (1 thread) + Intel iGPU

|Name of Test|noOCL|convertFp16|convertTo BASE|convertTo PATCH|
|---|:-:|:-:|:-:|:-:|
|ConvertFP16FP32MatMat::OCL_Core|3.130|3.152|3.127|3.136|
|ConvertFP16FP32MatUMat::OCL_Core|3.030|3.996|3.007|2.671|
|ConvertFP16FP32UMatMat::OCL_Core|3.010|3.101|3.056|2.854|
|ConvertFP16FP32UMatUMat::OCL_Core|3.016|3.298|2.072|2.061|
|ConvertFP32FP16MatMat::OCL_Core|2.697|2.652|2.723|2.721|
|ConvertFP32FP16MatUMat::OCL_Core|2.752|4.268|2.662|2.947|
|ConvertFP32FP16UMatMat::OCL_Core|2.706|2.601|2.603|2.528|
|ConvertFP32FP16UMatUMat::OCL_Core|2.704|3.215|1.999|1.988|

Patched version is not worse than convertFp16 and the convertTo baseline (except MatUMat 32->16, where the baseline uses CPU code plus a dst buffer map).
There are still gaps against noOpenCL(CPU only) mode due to T-API implementation issues (unnecessary synchronization).


- [x] 12700K + AMD dGPU

|Name of Test|noOCL|convertFp16 dGPU|convertTo BASE dGPU|convertTo PATCH dGPU|
|---|:-:|:-:|:-:|:-:|
|ConvertFP16FP32MatMat::OCL_Core|3.130|3.133|3.172|3.087|
|ConvertFP16FP32MatUMat::OCL_Core|3.030|1.713|9.559|1.729|
|ConvertFP16FP32UMatMat::OCL_Core|3.010|6.515|6.309|4.452|
|ConvertFP16FP32UMatUMat::OCL_Core|3.016|0.242|23.597|0.170|
|ConvertFP32FP16MatMat::OCL_Core|2.697|2.641|2.713|2.689|
|ConvertFP32FP16MatUMat::OCL_Core|2.752|4.076|6.483|4.191|
|ConvertFP32FP16UMatMat::OCL_Core|2.706|9.042|16.481|1.834|
|ConvertFP32FP16UMatUMat::OCL_Core|2.704|0.229|15.730|0.176|

The convertTo baseline could not compile the OpenCL kernel for FP16 properly — FIXED by this patch.
dGPU has much more power, so results are 16-17x better than a single CPU core.
Patched version is not worse than convertFp16 and convertTo baseline.
There are still gaps against noOpenCL(CPU only) mode due to T-API implementation issues (unnecessary synchronization) and required memory transfers.

Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
pull/24892/head
Alexander Alekhin 10 months ago committed by GitHub
parent ae21368eb9
commit 40533dbf69
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      modules/core/include/opencv2/core.hpp
  2. 5
      modules/core/include/opencv2/core/ocl.hpp
  3. 8
      modules/core/include/opencv2/core/opencl/opencl_info.hpp
  4. 181
      modules/core/perf/opencl/perf_matop.cpp
  5. 154
      modules/core/src/convert.dispatch.cpp
  6. 10
      modules/core/src/ocl.cpp
  7. 3
      modules/core/src/ocl_disabled.impl.hpp
  8. 13
      modules/core/src/opencl/convert.cl
  9. 66
      modules/core/src/umatrix.cpp

@ -556,6 +556,8 @@ The format of half precision floating point is defined in IEEE 754-2008.
@param src input array.
@param dst output array.
@deprecated Use Mat::convertTo with CV_16F instead.
*/
CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);

@ -127,6 +127,11 @@ public:
CV_WRAP int singleFPConfig() const;
CV_WRAP int halfFPConfig() const;
/// true if 'cl_khr_fp64' extension is available
CV_WRAP bool hasFP64() const;
/// true if 'cl_khr_fp16' extension is available
CV_WRAP bool hasFP16() const;
CV_WRAP bool endianLittle() const;
CV_WRAP bool errorCorrectionSupport() const;

@ -141,13 +141,13 @@ static void dumpOpenCLInformation()
DUMP_MESSAGE_STDOUT(" Max memory allocation size = " << maxMemAllocSizeStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Double support = " << doubleSupportStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());
const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Half support = " << halfSupportStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());
const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Host unified memory = " << isUnifiedMemoryStr);

@ -80,6 +80,187 @@ OCL_PERF_TEST_P(ConvertToFixture, ConvertTo,
SANITY_CHECK(dst);
}
// Perf comparison harness: by default these tests benchmark convertTo() with CV_16F;
// define RUN_CONVERTFP16 to benchmark the deprecated convertFp16() path instead.
//#define RUN_CONVERTFP16
// Common 4000x4000 input size shared by all ConvertFP16/FP32 perf tests below.
static Size convertFP16_srcSize(4000, 4000);
// FP32 -> FP16 conversion, Mat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16MatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, Mat source and UMat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16MatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, UMat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16UMatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, UMat source and UMat destination (pure OpenCL path).
OCL_PERF_TEST(Core, ConvertFP32FP16UMatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, Mat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32MatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, Mat source and UMat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32MatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, UMat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32UMatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, UMat source and UMat destination (pure OpenCL path).
OCL_PERF_TEST(Core, ConvertFP16FP32UMatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
///////////// CopyTo ////////////////////////
typedef Size_MatType CopyToFixture;

@ -169,52 +169,130 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
// OpenCL implementation of convertTo() (dst = src*alpha + beta), running the
// "convertTo" kernel from opencl/convert.cl.
// Returns false when the device lacks required FP64/FP16 extension support or
// the kernel cannot be built/run, so the caller can fall back to the CPU path.
// @param src_    input array (any depth/channels)
// @param dst_    output array, created here with the same size and 'ddepth'
// @param ddepth  resolved destination depth (must be >= 0)
// @param noScale true when alpha==1 and beta==0, i.e. pure type conversion
static bool ocl_convertTo(InputArray src_, OutputArray dst_, int ddepth, bool noScale, double alpha, double beta)
{
CV_INSTRUMENT_REGION();
CV_Assert(ddepth >= 0);  // caller must resolve the destination depth first
int stype = src_.type();
int sdepth = CV_MAT_DEPTH(stype);
int cn = CV_MAT_CN(stype);
int dtype = CV_MAKETYPE(ddepth, cn);
// Working (intermediate) depth: compute in double only for CV_64F sources,
// otherwise in float.
int wdepth = (sdepth == CV_64F) ? CV_64F : CV_32F;
bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
bool doubleCheck = true;
if (needDouble)
{
// double requires the 'cl_khr_fp64' extension on the device
doubleCheck = ocl::Device::getDefault().hasFP64();
}
bool halfCheck = true;
bool needHalf = sdepth == CV_16F || ddepth == CV_16F;
if (needHalf)
{
// half requires the 'cl_khr_fp16' extension on the device
halfCheck = ocl::Device::getDefault().hasFP16();
}
if (!doubleCheck)
return false;  // fall back to CPU implementation
if (!halfCheck)
return false;  // fall back to CPU implementation
const int rowsPerWI = 4;  // rows processed by each work-item
char cvt[2][50];
// Build options select src/working/dst element types and conversions;
// DOUBLE_SUPPORT/HALF_SUPPORT enable the corresponding extensions inside the
// kernel source; NO_SCALE compiles out the alpha/beta kernel parameters.
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s -D rowsPerWI=%d%s%s%s",
ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
rowsPerWI,
needDouble ? " -D DOUBLE_SUPPORT" : "",
needHalf ? " -D HALF_SUPPORT" : "",
noScale ? " -D NO_SCALE" : ""
)
);
if (k.empty())
return false;
UMat src = src_.getUMat();
dst_.createSameSize(src_, dtype);
UMat dst = dst_.getUMat();
float alphaf = (float)alpha, betaf = (float)beta;
// The argument list must match the kernel signature, which changes with
// NO_SCALE; scale/shift are passed at the working depth (float or double).
if (noScale)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn));
else if (wdepth == CV_32F)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alpha, beta);
size_t globalsize[2] = {
(size_t)dst.cols * cn,        // one work-item per element column
divUp((size_t)dst.rows, rowsPerWI)  // rows are batched per work-item
};
if (!k.run(2, globalsize, NULL, false))
return false;
CV_IMPL_ADD(CV_IMPL_OCL);
return true;
}
#endif
void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
if( empty() )
if (empty())
{
_dst.release();
dst.release();
return;
}
bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
int stype = type();
int sdepth = CV_MAT_DEPTH(stype);
if( _type < 0 )
_type = _dst.fixedType() ? _dst.type() : type();
int ddepth = sdepth;
if (type_ >= 0)
ddepth = CV_MAT_DEPTH(type_);
else
_type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
ddepth = dst.fixedType() ? dst.depth() : sdepth;
int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
if( sdepth == ddepth && noScale )
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
if (sdepth == ddepth && noScale)
{
copyTo(_dst);
copyTo(dst);
return;
}
CV_OCL_RUN(dims <= 2 && dst.isUMat(),
ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
int cn = channels();
int dtype = CV_MAKETYPE(ddepth, cn);
Mat src = *this;
if( dims <= 2 )
_dst.create( size(), _type );
else
_dst.create( dims, size, _type );
Mat dst = _dst.getMat();
dst.create(dims, size, dtype);
Mat dstMat = dst.getMat();
BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
double scale[] = {alpha, beta};
int cn = channels();
CV_Assert( func != 0 );
if( dims <= 2 )
{
Size sz = getContinuousSize2D(src, dst, cn);
func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
Size sz = getContinuousSize2D(src, dstMat, cn);
func(src.data, src.step, 0, 0, dstMat.data, dstMat.step, sz, scale);
}
else
{
const Mat* arrays[] = {&src, &dst, 0};
const Mat* arrays[] = {&src, &dstMat, 0};
uchar* ptrs[2] = {};
NAryMatIterator it(arrays, ptrs);
Size sz((int)(it.size*cn), 1);
@ -224,6 +302,44 @@ void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) cons
}
}
// Converts the array to another datatype with optional scaling
// (dst = *this * alpha + beta). Tries the OpenCL kernel first for 2D arrays;
// otherwise maps the data to host memory and delegates to Mat::convertTo().
void UMat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
if (empty())
{
dst.release();
return;
}
#ifdef HAVE_OPENCL
int stype = type();
int sdepth = CV_MAT_DEPTH(stype);
int ddepth = sdepth;
if (type_ >= 0)
ddepth = CV_MAT_DEPTH(type_);
else
// Negative requested type means "keep the source depth" unless the
// destination already has a fixed type.
ddepth = dst.fixedType() ? dst.depth() : sdepth;
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
if (sdepth == ddepth && noScale)
{
// Identity conversion: a plain copy is sufficient.
copyTo(dst);
return;
}
// CV_OCL_RUN returns from this function when the OpenCL path succeeds.
CV_OCL_RUN(dims <= 2,
ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
#endif // HAVE_OPENCL
UMat src = *this; // Fake reference to itself.
// Resolves issue 8693 in case of src == dst.
Mat m = getMat(ACCESS_READ);
m.convertTo(dst, type_, alpha, beta);
(void)src;
}
//==================================================================================================
void convertFp16(InputArray _src, OutputArray _dst)

@ -1604,6 +1604,9 @@ struct Device::Impl
pos = pos2 + 1;
}
khr_fp64_support_ = isExtensionSupported("cl_khr_fp64");
khr_fp16_support_ = isExtensionSupported("cl_khr_fp16");
intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
vendorName_ = getStrProp(CL_DEVICE_VENDOR);
@ -1692,7 +1695,9 @@ struct Device::Impl
String version_;
std::string extensions_;
int doubleFPConfig_;
bool khr_fp64_support_;
int halfFPConfig_;
bool khr_fp16_support_;
bool hostUnifiedMemory_;
int maxComputeUnits_;
size_t maxWorkGroupSize_;
@ -1844,6 +1849,11 @@ int Device::singleFPConfig() const
// CL_DEVICE_HALF_FP_CONFIG capability bits; 0 when no device is selected.
int Device::halfFPConfig() const
{ return p ? p->halfFPConfig_ : 0; }
// true if the 'cl_khr_fp64' extension is available; false when no device.
bool Device::hasFP64() const
{ return p ? p->khr_fp64_support_ : false; }
// true if the 'cl_khr_fp16' extension is available; false when no device.
bool Device::hasFP16() const
{ return p ? p->khr_fp16_support_ : false; }
// CL_DEVICE_ENDIAN_LITTLE property; false when no device is selected.
bool Device::endianLittle() const
{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }

@ -67,6 +67,9 @@ int Device::doubleFPConfig() const { OCL_NOT_AVAILABLE(); }
// Stubs for builds without OpenCL support: every query raises the
// "OpenCL is not available" error instead of returning a value.
int Device::singleFPConfig() const { OCL_NOT_AVAILABLE(); }
int Device::halfFPConfig() const { OCL_NOT_AVAILABLE(); }
bool Device::hasFP64() const { OCL_NOT_AVAILABLE(); }
bool Device::hasFP16() const { OCL_NOT_AVAILABLE(); }
bool Device::endianLittle() const { OCL_NOT_AVAILABLE(); }
bool Device::errorCorrectionSupport() const { OCL_NOT_AVAILABLE(); }

@ -49,14 +49,21 @@
#endif
#endif
#ifdef HALF_SUPPORT
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16:enable
#endif
#endif
#define noconvert
__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
#ifndef NO_SCALE
WT alpha, WT beta,
, WT alpha, WT beta
#endif
int rowsPerWI)
)
{
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;

@ -1233,70 +1233,10 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
src.copyTo(_dst, _mask);
}
// Legacy implementation of UMat::convertTo() (dst = *this * alpha + beta).
// NOTE(review): this copy is the one removed from umatrix.cpp by this commit;
// the implementation now lives in convert.dispatch.cpp.
void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
int stype = type(), cn = CV_MAT_CN(stype);
if( _type < 0 )
_type = _dst.fixedType() ? _dst.type() : stype;
else
// Keep the source channel count; only the depth may change.
_type = CV_MAKETYPE(CV_MAT_DEPTH(_type), cn);
int sdepth = CV_MAT_DEPTH(stype), ddepth = CV_MAT_DEPTH(_type);
if( sdepth == ddepth && noScale )
{
// Identity conversion: a plain copy is sufficient.
copyTo(_dst);
return;
}
#ifdef HAVE_OPENCL
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
// OpenCL path is taken only for 2D arrays with a UMat destination, and only
// when any CV_64F depth is backed by device double support.
if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
((needDouble && doubleSupport) || !needDouble) )
{
// Working depth is at least float; each work-item handles rowsPerWI rows.
int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
char cvt[2][50];
// Build options select src/working/dst types; NO_SCALE compiles out the
// alpha/beta parameters. No HALF_SUPPORT flag here — CV_16F kernels could
// not be compiled properly by this version (fixed in the replacement).
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s%s",
ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
doubleSupport ? " -D DOUBLE_SUPPORT" : "", noScale ? " -D NO_SCALE" : ""));
if (!k.empty())
{
UMat src = *this;
_dst.create( size(), _type );
UMat dst = _dst.getUMat();
float alphaf = (float)alpha, betaf = (float)beta;
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
dstarg = ocl::KernelArg::WriteOnly(dst, cn);
// Argument list depends on NO_SCALE; scale/shift use the working depth.
if (noScale)
k.args(srcarg, dstarg, rowsPerWI);
else if (wdepth == CV_32F)
k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
else
k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
size_t globalsize[2] = { (size_t)dst.cols * cn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
if (k.run(2, globalsize, NULL, false))
{
CV_IMPL_ADD(CV_IMPL_OCL);
return;
}
}
}
#endif
UMat src = *this; // Fake reference to itself.
// Resolves issue 8693 in case of src == dst.
Mat m = getMat(ACCESS_READ);
m.convertTo(_dst, _type, alpha, beta);
}
//
// void UMat::convertTo moved to convert.dispatch.cpp
//
UMat& UMat::setTo(InputArray _value, InputArray _mask)
{

Loading…
Cancel
Save