Merge pull request #20670 from alalek:core_ocl_fix_intel_gpu_gemm_requirements

core(OpenCL): fix intel_gpu_gemm kernel requirements

* core(ocl): fix intel_gpu_gemm integration

- allow bailout to generic OpenCL kernel

* core(ocl): avoid failures of generic OpenCL gemm kernel

* core(ocl): define alignment requirements of intel_gpu_gemm kernels
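For orientation, the shape/alignment requirements that this patch enforces before selecting an Intel subgroup kernel can be summarized as a standalone predicate. This is a minimal sketch that mirrors the bailout checks added in intel_gpu_gemm.inl.hpp; the helper name and free-function form are illustrative only, not part of the patch:

    // Sketch: does the requested GEMM shape satisfy the Intel subgroup kernel
    // requirements introduced by this patch? M/N are rows/cols of D, K is the
    // shared dimension; atrans/btrans select the kernel variant.
    static bool intelGemmShapeSupported(int M, int N, int K, bool atrans, bool btrans)
    {
        if (M < 4 || N < 4 || K < 4)                     // all kernels read via vload4
            return false;
        if (!atrans && !btrans)
        {
            if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
                return true;                             // intelblas_gemm_buffer_NN_sp
            return (M % 2 == 0) && (N % (4 * 8) == 0);   // intelblas_gemm_buffer_NN
        }
        if (atrans && !btrans)                           // intelblas_gemm_buffer_TN
            return (M % 32 == 0) && (N % 32 == 0);
        if (!atrans && btrans)                           // intelblas_gemm_buffer_NT
            return (M % 128 == 0) && (N % 8 == 0) && (K % 512 == 0);
        return (M % 32 == 0) && (N % 32 == 0) && (K % 16 == 0);  // intelblas_gemm_buffer_TT
    }

When any of these checks fails, the patched code returns false and cv::gemm falls back to the generic OpenCL kernel instead of running an Intel kernel on an unsupported shape.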
Author: Alexander Alekhin (committed via GitHub)
parent 6ace801418
commit e3f4f874c5
Changed files:

  1. modules/core/src/intel_gpu_gemm.inl.hpp (69 changed lines)
  2. modules/core/src/matmul.dispatch.cpp (153 changed lines)
  3. modules/core/test/ocl/test_gemm.cpp (34 changed lines)

modules/core/src/intel_gpu_gemm.inl.hpp

@@ -24,11 +24,6 @@
 #ifdef HAVE_OPENCL
-#include <sstream>
-#include "opencl_kernels_core.hpp"
-#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
 namespace cv
 {
@@ -37,52 +32,79 @@ static bool intel_gpu_gemm(
     UMat B, Size sizeB,
     UMat D, Size sizeD,
     double alpha, double beta,
-    bool atrans, bool btrans)
+    bool atrans, bool btrans,
+    bool& isPropagatedC2D
+)
 {
     CV_UNUSED(sizeB);
     int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
+    if (M < 4 || N < 4 || K < 4) // vload4
+        return false;
+    CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
     std::string kernelName;
-    bool ret = true;
-    size_t lx = 8, ly = 4;
-    size_t dx = 4, dy = 8;
+    unsigned int lx = 8, ly = 4;
+    unsigned int dx = 4, dy = 8;
     if(!atrans && !btrans)
     {
         if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
         {
             kernelName = "intelblas_gemm_buffer_NN_sp";
         }
         else
         {
+            if (M % 2 != 0)
+                return false;
+            // vload4(0, dst_write0) - 4 cols
+            // multiply by lx: 8
+            if (N % (4*8) != 0)
+                return false;
             kernelName = "intelblas_gemm_buffer_NN";
         }
     }
     else if(atrans && !btrans)
     {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_TN";
     }
     else if(!atrans && btrans)
     {
+        if (M % 128 != 0)
+            return false;
+        if (N % 8 != 0)
+            return false;
+        if (K % 512 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_NT";
         ly = 16;
         dx = 1;
     }
     else
     {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
+        if (K % 16 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_TT";
     }
-    const size_t gx = (size_t)(N + dx - 1) / dx;
-    const size_t gy = (size_t)(M + dy - 1) / dy;
-    size_t local[] = {lx, ly, 1};
-    size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
-    int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
+    CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")");
+    const size_t gx = divUp((size_t)N, dx);
+    const size_t gy = divUp((size_t)M, dy);
+    size_t local[] = {lx, ly, 1};
+    size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
     ocl::Queue q;
     String errmsg;
@@ -110,10 +132,13 @@ static bool intel_gpu_gemm(
             (int)(D.step / sizeof(float))
         );
-        ret = k.run(2, global, local, false, q);
+        bool ret = k.run(2, global, local, false, q);
+        return ret;
     }
     else
     {
+        int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
         for(int start_index = 0; start_index < K; start_index += stride)
         {
             ocl::Kernel k(kernelName.c_str(), program);
@@ -132,12 +157,16 @@ static bool intel_gpu_gemm(
                 (int) start_index, // 14 start_index
                 stride);
-            ret = k.run(2, global, local, false, q);
-            if (!ret) return ret;
+            bool ret = k.run(2, global, local, false, q);
+            if (!ret)
+            {
+                if (start_index != 0)
+                    isPropagatedC2D = false; // D array content is changed, need to rewrite
+                return false;
+            }
         }
+        return true;
     }
-    return ret;
 }
 } // namespace cv
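The new launch-size setup above goes through cv::divUp / cv::roundUp instead of hand-written expressions. The following sketch spells out the equivalent arithmetic with locally defined helpers (the helper names and the worked numbers are illustrative, not part of the patch):

    #include <cstddef>

    // Equivalent arithmetic to the divUp/roundUp calls in the new ND-range setup:
    // each work-item covers a dx x dy tile of D, and the global size is padded
    // up to a multiple of the work-group size (lx, ly).
    static size_t div_up(size_t a, unsigned int b)   { return (a + b - 1) / b; }
    static size_t round_up(size_t a, unsigned int b) { return div_up(a, b) * b; }

    // Example with the NN defaults (dx=4, dy=8, lx=8, ly=4) and N=100, M=37:
    //   gx = div_up(100, 4) = 25   ->  global_x = round_up(25, 8) = 32
    //   gy = div_up(37, 8)  = 5    ->  global_y = round_up(5, 4)  = 8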

modules/core/src/matmul.dispatch.cpp

@@ -42,6 +42,8 @@
 //M*/
 #include "precomp.hpp"
+#include <opencv2/core/utils/logger.hpp>
 #include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
@@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                       InputArray matC, double beta, OutputArray matD, int flags )
 {
-    int depth = matA.depth(), cn = matA.channels();
-    int type = CV_MAKETYPE(depth, cn);
-    CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
+    int type = matA.type();
+    int depth = CV_MAT_DEPTH(type);
+    int cn = CV_MAT_CN(type);
+    CV_CheckTypeEQ(type, matB.type(), "");
+    CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
     const ocl::Device & dev = ocl::Device::getDefault();
     bool doubleSupport = dev.doubleFPConfig() > 0;
@@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
     Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
     bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
-    CV_Assert( !haveC || matC.type() == type );
-    Size sizeD(((btrans)? sizeB.height : sizeB.width),
-               ((atrans)? sizeA.width : sizeA.height));
-    matD.create(sizeD, type);
-    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
-    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
-    {
-        String opts;
-        if (atrans)
-            sizeA = Size(sizeA.height, sizeA.width);
-        if (btrans)
-            sizeB = Size(sizeB.height, sizeB.width);
-        if (haveC && ctrans)
-            sizeC = Size(sizeC.height, sizeC.width);
-        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
-        int max_wg_size = (int)dev.maxWorkGroupSize();
-        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
-        if (atrans)
-            A = A.t();
-        if (btrans)
-            B = B.t();
-        if (haveC)
-            ctrans ? transpose(matC, D) : matC.copyTo(D);
-        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
-        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
-        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
-                       ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
-                       cn, kercn, block_size,
-                       (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
-                       haveC ? " -D HAVE_C" : "",
-                       doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
-        if (k.empty())
-            return false;
-        if (depth == CV_64F)
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, alpha, beta);
-        else
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, (float)alpha, (float)beta);
-        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
-        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
-        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
-    }
-    else
+    if (haveC)
+        CV_CheckTypeEQ(type, matC.type(), "");
+    Size sizeD(((btrans) ? sizeB.height : sizeB.width),
+               ((atrans) ? sizeA.width : sizeA.height));
+    if (atrans)
+        sizeA = Size(sizeA.height, sizeA.width);
+    if (btrans)
+        sizeB = Size(sizeB.height, sizeB.width);
+    if (haveC && ctrans)
+        sizeC = Size(sizeC.height, sizeC.width);
+    CV_CheckEQ(sizeA.width, sizeB.height, "");
+    if (haveC)
+        CV_CheckEQ(sizeC, sizeD, "");
+    UMat A = matA.getUMat();
+    UMat B = matB.getUMat();
+    matD.create(sizeD, type);
+    UMat D = matD.getUMat();
+    bool isPropagatedC2D = false; // D content is updated with C / C.t()
+    if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
     {
         if (haveC && beta != 0.0)
         {
             ctrans ? transpose(matC, D) : matC.copyTo(D);
+            isPropagatedC2D = true;
         }
         else
         {
             beta = 0.0;
         }
-        return intel_gpu_gemm(A, sizeA,
-                              B, sizeB,
+        bool res = intel_gpu_gemm(A, matA.size(),
+                              B, matB.size(),
                               D, sizeD,
                               alpha,
                               beta,
-                              atrans, btrans);
+                              atrans, btrans,
+                              isPropagatedC2D);
+        if (res)
+            return true;
+        // fallback on generic OpenCL code
     }
+    if (sizeD.width < 8 || sizeD.height < 8)
+        return false;
+    String opts;
+    int wg_size = (int)dev.maxWorkGroupSize();
+    int sizeDmin = std::min(sizeD.width, sizeD.height);
+    wg_size = std::min(wg_size, sizeDmin * sizeDmin);
+    int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
+    if (atrans)
+        A = A.t();
+    if (btrans)
+        B = B.t();
+    if (haveC && !isPropagatedC2D)
+        ctrans ? transpose(matC, D) : matC.copyTo(D);
+    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
+    int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
+    opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
+                   ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
+                   cn, kercn, block_size,
+                   (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
+                   haveC ? " -D HAVE_C" : "",
+                   doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+    ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
+    if (k.empty())
+        return false;
+    if (depth == CV_64F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, alpha, beta);
+    else
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, (float)alpha, (float)beta);
+    size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
+    size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
+    return k.run(2, globalsize, block_size != 1 ? localsize : NULL, false);
 }
 #endif
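Putting the dispatch change together: the Intel path is tried first and, if it bails out, the generic kernel runs on the same D. The one subtlety is re-propagating C into D when a partially executed Intel kernel has already overwritten it, which is what isPropagatedC2D tracks. A simplified control-flow sketch; the stub functions are hypothetical placeholders, not the real API in matmul.dispatch.cpp:

    // Hypothetical placeholders standing in for the real checks and kernels:
    static bool intelPathApplicable() { return true; }   // dev.intelSubgroupsSupport() && CV_32F && cn == 1
    static void propagateC2D() {}                        // ctrans ? transpose(C, D) : C.copyTo(D)
    static bool intelGemm(bool& isPropagatedC2D)
    {
        isPropagatedC2D = false;  // this stub pretends the kernel bailed out after touching D
        return false;
    }
    static bool genericGemm() { return true; }           // generic "gemm" OpenCL kernel

    static bool gemmDispatchSketch(bool haveC, double beta)
    {
        bool isPropagatedC2D = false;                    // does D currently hold C (or C^T)?
        if (intelPathApplicable())
        {
            if (haveC && beta != 0.0)
            {
                propagateC2D();
                isPropagatedC2D = true;
            }
            if (intelGemm(isPropagatedC2D))
                return true;                             // Intel subgroup kernels handled it
            // otherwise fall through to the generic OpenCL kernel
        }
        if (haveC && !isPropagatedC2D)
            propagateC2D();                              // (re)initialize D before the generic kernel
        return genericGemm();
    }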

modules/core/test/ocl/test_gemm.cpp

@@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
     double alpha, beta;
+    int M, N, K;
     TEST_DECLARE_INPUT_PARAMETER(A);
     TEST_DECLARE_INPUT_PARAMETER(B);
     TEST_DECLARE_INPUT_PARAMETER(C);
@@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
     void generateTestData()
     {
-        // set minimum size to 20, since testing less sizes doesn't make sense
-        Size ARoiSize = randomSize(20, MAX_VALUE);
-        Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
-        if (atrans)
-            ARoiSize = Size(ARoiSize.height, ARoiSize.width);
-        Size BRoiSize = randomSize(20, MAX_VALUE);
-        if (btrans)
-            BRoiSize.width = ARoiSize.width;
-        else
-            BRoiSize.height = ARoiSize.width;
+        M = (int)randomDoubleLog(1, 100);
+        N = (int)randomDoubleLog(1, 100);
+        K = (int)randomDoubleLog(1, 1200);
+        M = roundUp(M, 1);
+        N = roundUp(N, 1);
+        K = roundUp(K, 1);
+        Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
+        Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
+        Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
         Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
-        if (btrans)
-            BRoiSize = Size(BRoiSize.height, BRoiSize.width);
-        Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
+        Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
         Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
+        randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
+        Size DRoiSize = Size(N, M);
         Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
@@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
     for (int i = 0; i < test_loop_times; ++i)
     {
         generateTestData();
+        SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
         OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
         OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
-        double eps = D_roi.size().area() * 1e-4;
+        double eps = D_roi.size().area() * (1e-5 * K);
         OCL_EXPECT_MATS_NEAR(D, eps);
     }
 }
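The test now draws M, N, K from a log-scaled range (so tiny, unaligned and large shapes are all exercised) and scales the tolerance with K, since every element of D is a sum of K products of values in [-11, 11]. A minimal sketch of both ideas; randomDoubleLogSketch is an illustrative stand-in for the cvtest helper, not its actual implementation:

    #include <cmath>
    #include <cstdlib>

    // Log-uniform sampling: equal probability mass per decade, so small sizes
    // (which hit the new bailout paths) are drawn as often as large ones.
    static double randomDoubleLogSketch(double minVal, double maxVal)
    {
        double u = std::rand() / (double)RAND_MAX;   // uniform in [0, 1]
        return std::exp(std::log(minVal) + u * (std::log(maxVal) - std::log(minVal)));
    }

    // Tolerance used by the updated test: eps = area(D) * 1e-5 * K.
    // For M = N = 100 and K = 1000 this gives 100*100 * 1e-5 * 1000 = 100.
    static double gemmEps(int Dwidth, int Dheight, int K)
    {
        return (double)Dwidth * Dheight * (1e-5 * K);
    }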
