|
|
|
@ -42,6 +42,8 @@ |
|
|
|
|
//M*/
|
|
|
|
|
|
|
|
|
|
#include "precomp.hpp" |
|
|
|
|
#include <opencv2/core/utils/logger.hpp> |
|
|
|
|
|
|
|
|
|
#include "opencl_kernels_core.hpp" |
|
|
|
|
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" |
|
|
|
|
#include "opencv2/core/opencl/runtime/opencl_core.hpp" |
|
|
|
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, |
|
|
|
|
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, |
|
|
|
|
InputArray matC, double beta, OutputArray matD, int flags ) |
|
|
|
|
{ |
|
|
|
|
int depth = matA.depth(), cn = matA.channels(); |
|
|
|
|
int type = CV_MAKETYPE(depth, cn); |
|
|
|
|
int type = matA.type(); |
|
|
|
|
int depth = CV_MAT_DEPTH(type); |
|
|
|
|
int cn = CV_MAT_CN(type); |
|
|
|
|
|
|
|
|
|
CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); |
|
|
|
|
CV_CheckTypeEQ(type, matB.type(), ""); |
|
|
|
|
CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, ""); |
|
|
|
|
|
|
|
|
|
const ocl::Device & dev = ocl::Device::getDefault(); |
|
|
|
|
bool doubleSupport = dev.doubleFPConfig() > 0; |
|
|
|
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, |
|
|
|
|
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); |
|
|
|
|
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; |
|
|
|
|
|
|
|
|
|
CV_Assert( !haveC || matC.type() == type ); |
|
|
|
|
|
|
|
|
|
Size sizeD(((btrans)? sizeB.height : sizeB.width), |
|
|
|
|
((atrans)? sizeA.width : sizeA.height)); |
|
|
|
|
matD.create(sizeD, type); |
|
|
|
|
|
|
|
|
|
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1) |
|
|
|
|
{ |
|
|
|
|
String opts; |
|
|
|
|
|
|
|
|
|
if (atrans) |
|
|
|
|
sizeA = Size(sizeA.height, sizeA.width); |
|
|
|
|
if (btrans) |
|
|
|
|
sizeB = Size(sizeB.height, sizeB.width); |
|
|
|
|
if (haveC && ctrans) |
|
|
|
|
sizeC = Size(sizeC.height, sizeC.width); |
|
|
|
|
|
|
|
|
|
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); |
|
|
|
|
|
|
|
|
|
int max_wg_size = (int)dev.maxWorkGroupSize(); |
|
|
|
|
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; |
|
|
|
|
|
|
|
|
|
if (atrans) |
|
|
|
|
A = A.t(); |
|
|
|
|
|
|
|
|
|
if (btrans) |
|
|
|
|
B = B.t(); |
|
|
|
|
if (haveC) |
|
|
|
|
CV_CheckTypeEQ(type, matC.type(), ""); |
|
|
|
|
|
|
|
|
|
if (haveC) |
|
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D); |
|
|
|
|
Size sizeD(((btrans) ? sizeB.height : sizeB.width), |
|
|
|
|
((atrans) ? sizeA.width : sizeA.height)); |
|
|
|
|
|
|
|
|
|
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; |
|
|
|
|
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); |
|
|
|
|
if (atrans) |
|
|
|
|
sizeA = Size(sizeA.height, sizeA.width); |
|
|
|
|
if (btrans) |
|
|
|
|
sizeB = Size(sizeB.height, sizeB.width); |
|
|
|
|
if (haveC && ctrans) |
|
|
|
|
sizeC = Size(sizeC.height, sizeC.width); |
|
|
|
|
|
|
|
|
|
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", |
|
|
|
|
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), |
|
|
|
|
cn, kercn, block_size, |
|
|
|
|
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "", |
|
|
|
|
haveC ? " -D HAVE_C" : "", |
|
|
|
|
doubleSupport ? " -D DOUBLE_SUPPORT" : ""); |
|
|
|
|
CV_CheckEQ(sizeA.width, sizeB.height, ""); |
|
|
|
|
if (haveC) |
|
|
|
|
CV_CheckEQ(sizeC, sizeD, ""); |
|
|
|
|
|
|
|
|
|
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); |
|
|
|
|
if (k.empty()) |
|
|
|
|
return false; |
|
|
|
|
UMat A = matA.getUMat(); |
|
|
|
|
UMat B = matB.getUMat(); |
|
|
|
|
|
|
|
|
|
if (depth == CV_64F) |
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A), |
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), |
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn), |
|
|
|
|
sizeA.width, alpha, beta); |
|
|
|
|
else |
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A), |
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), |
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn), |
|
|
|
|
sizeA.width, (float)alpha, (float)beta); |
|
|
|
|
matD.create(sizeD, type); |
|
|
|
|
UMat D = matD.getUMat(); |
|
|
|
|
|
|
|
|
|
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; |
|
|
|
|
size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; |
|
|
|
|
bool isPropagatedC2D = false; // D content is updated with C / C.t()
|
|
|
|
|
|
|
|
|
|
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1) |
|
|
|
|
{ |
|
|
|
|
if (haveC && beta != 0.0) |
|
|
|
|
{ |
|
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D); |
|
|
|
|
isPropagatedC2D = true; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
beta = 0.0; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return intel_gpu_gemm(A, sizeA, |
|
|
|
|
B, sizeB, |
|
|
|
|
D, sizeD, |
|
|
|
|
alpha, |
|
|
|
|
beta, |
|
|
|
|
atrans, btrans); |
|
|
|
|
bool res = intel_gpu_gemm(A, matA.size(), |
|
|
|
|
B, matB.size(), |
|
|
|
|
D, sizeD, |
|
|
|
|
alpha, |
|
|
|
|
beta, |
|
|
|
|
atrans, btrans, |
|
|
|
|
isPropagatedC2D); |
|
|
|
|
if (res) |
|
|
|
|
return true; |
|
|
|
|
// fallback on generic OpenCL code
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (sizeD.width < 8 || sizeD.height < 8) |
|
|
|
|
return false; |
|
|
|
|
|
|
|
|
|
String opts; |
|
|
|
|
|
|
|
|
|
int wg_size = (int)dev.maxWorkGroupSize(); |
|
|
|
|
int sizeDmin = std::min(sizeD.width, sizeD.height); |
|
|
|
|
wg_size = std::min(wg_size, sizeDmin * sizeDmin); |
|
|
|
|
int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; |
|
|
|
|
|
|
|
|
|
if (atrans) |
|
|
|
|
A = A.t(); |
|
|
|
|
|
|
|
|
|
if (btrans) |
|
|
|
|
B = B.t(); |
|
|
|
|
|
|
|
|
|
if (haveC && !isPropagatedC2D) |
|
|
|
|
ctrans ? transpose(matC, D) : matC.copyTo(D); |
|
|
|
|
|
|
|
|
|
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; |
|
|
|
|
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); |
|
|
|
|
|
|
|
|
|
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", |
|
|
|
|
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), |
|
|
|
|
cn, kercn, block_size, |
|
|
|
|
(sizeA.width % block_size !=0) ? " -D NO_MULT" : "", |
|
|
|
|
haveC ? " -D HAVE_C" : "", |
|
|
|
|
doubleSupport ? " -D DOUBLE_SUPPORT" : ""); |
|
|
|
|
|
|
|
|
|
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); |
|
|
|
|
if (k.empty()) |
|
|
|
|
return false; |
|
|
|
|
|
|
|
|
|
if (depth == CV_64F) |
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A), |
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), |
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn), |
|
|
|
|
sizeA.width, alpha, beta); |
|
|
|
|
else |
|
|
|
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A), |
|
|
|
|
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), |
|
|
|
|
ocl::KernelArg::ReadWrite(D, cn, kercn), |
|
|
|
|
sizeA.width, (float)alpha, (float)beta); |
|
|
|
|
|
|
|
|
|
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; |
|
|
|
|
size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; |
|
|
|
|
|
|
|
|
|
return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|