From e3f4f874c5214397a41befa6735f31382e7c39d2 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 10 Sep 2021 15:00:11 +0300 Subject: [PATCH] Merge pull request #20670 from alalek:core_ocl_fix_intel_gpu_gemm_requirements core(OpenCL): fix intel_gpu_gemm kernel requirements * core(ocl): fix intel_gpu_gemm integration - allow bailout to generic OpenCL kernel * core(ocl): avoid failures of generic OpenCL gemm kernel * core(ocl): define alignment requirements of intel_gpu_gemm kernels --- modules/core/src/intel_gpu_gemm.inl.hpp | 69 +++++++---- modules/core/src/matmul.dispatch.cpp | 153 +++++++++++++----------- modules/core/test/ocl/test_gemm.cpp | 34 +++--- 3 files changed, 152 insertions(+), 104 deletions(-) diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp index fbd567b949..fa66856f5e 100644 --- a/modules/core/src/intel_gpu_gemm.inl.hpp +++ b/modules/core/src/intel_gpu_gemm.inl.hpp @@ -24,11 +24,6 @@ #ifdef HAVE_OPENCL -#include -#include "opencl_kernels_core.hpp" -#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" -#include "opencv2/core/opencl/runtime/opencl_core.hpp" - namespace cv { @@ -37,52 +32,79 @@ static bool intel_gpu_gemm( UMat B, Size sizeB, UMat D, Size sizeD, double alpha, double beta, - bool atrans, bool btrans) + bool atrans, bool btrans, + bool& isPropagatedC2D +) { CV_UNUSED(sizeB); int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width); + if (M < 4 || N < 4 || K < 4) // vload4 + return false; + + CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K); + std::string kernelName; - bool ret = true; - size_t lx = 8, ly = 4; - size_t dx = 4, dy = 8; + unsigned int lx = 8, ly = 4; + unsigned int dx = 4, dy = 8; if(!atrans && !btrans) { - if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0) { kernelName = "intelblas_gemm_buffer_NN_sp"; } else { + if (M % 2 != 0) + return false; + // vload4(0, dst_write0) - 4 cols + // multiply by lx: 8 + if (N % (4*8) != 0) + return false; kernelName = "intelblas_gemm_buffer_NN"; } } else if(atrans && !btrans) { + if (M % 32 != 0) + return false; + if (N % 32 != 0) + return false; kernelName = "intelblas_gemm_buffer_TN"; } else if(!atrans && btrans) { + if (M % 128 != 0) + return false; + if (N % 8 != 0) + return false; + if (K % 512 != 0) + return false; kernelName = "intelblas_gemm_buffer_NT"; ly = 16; dx = 1; } else { + if (M % 32 != 0) + return false; + if (N % 32 != 0) + return false; + if (K % 16 != 0) + return false; kernelName = "intelblas_gemm_buffer_TT"; } - const size_t gx = (size_t)(N + dx - 1) / dx; - const size_t gy = (size_t)(M + dy - 1) / dy; + CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")"); - size_t local[] = {lx, ly, 1}; - size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1}; + const size_t gx = divUp((size_t)N, dx); + const size_t gy = divUp((size_t)M, dy); - int stride = (M * N < 1024 * 1024) ? 10000000 : 256; + size_t local[] = {lx, ly, 1}; + size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1}; ocl::Queue q; String errmsg; @@ -110,10 +132,13 @@ static bool intel_gpu_gemm( (int)(D.step / sizeof(float)) ); - ret = k.run(2, global, local, false, q); + bool ret = k.run(2, global, local, false, q); + return ret; } else { + int stride = (M * N < 1024 * 1024) ? 10000000 : 256; + for(int start_index = 0; start_index < K; start_index += stride) { ocl::Kernel k(kernelName.c_str(), program); @@ -132,12 +157,16 @@ static bool intel_gpu_gemm( (int) start_index, // 14 start_index stride); - ret = k.run(2, global, local, false, q); - if (!ret) return ret; + bool ret = k.run(2, global, local, false, q); + if (!ret) + { + if (start_index != 0) + isPropagatedC2D = false; // D array content is changed, need to rewrite + return false; + } } + return true; } - - return ret; } } // namespace cv diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp index e81064ec16..a7447330fc 100644 --- a/modules/core/src/matmul.dispatch.cpp +++ b/modules/core/src/matmul.dispatch.cpp @@ -42,6 +42,8 @@ //M*/ #include "precomp.hpp" +#include + #include "opencl_kernels_core.hpp" #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" @@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, InputArray matC, double beta, OutputArray matD, int flags ) { - int depth = matA.depth(), cn = matA.channels(); - int type = CV_MAKETYPE(depth, cn); + int type = matA.type(); + int depth = CV_MAT_DEPTH(type); + int cn = CV_MAT_CN(type); - CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); + CV_CheckTypeEQ(type, matB.type(), ""); + CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, ""); const ocl::Device & dev = ocl::Device::getDefault(); bool doubleSupport = dev.doubleFPConfig() > 0; @@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; - CV_Assert( !haveC || matC.type() == type ); - - Size sizeD(((btrans)? sizeB.height : sizeB.width), - ((atrans)? sizeA.width : sizeA.height)); - matD.create(sizeD, type); - - UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); - - - if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1) - { - String opts; - - if (atrans) - sizeA = Size(sizeA.height, sizeA.width); - if (btrans) - sizeB = Size(sizeB.height, sizeB.width); - if (haveC && ctrans) - sizeC = Size(sizeC.height, sizeC.width); - - CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); - - int max_wg_size = (int)dev.maxWorkGroupSize(); - int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; - - if (atrans) - A = A.t(); - - if (btrans) - B = B.t(); + if (haveC) + CV_CheckTypeEQ(type, matC.type(), ""); - if (haveC) - ctrans ? transpose(matC, D) : matC.copyTo(D); + Size sizeD(((btrans) ? sizeB.height : sizeB.width), + ((atrans) ? sizeA.width : sizeA.height)); - int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; - int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); + if (atrans) + sizeA = Size(sizeA.height, sizeA.width); + if (btrans) + sizeB = Size(sizeB.height, sizeB.width); + if (haveC && ctrans) + sizeC = Size(sizeC.height, sizeC.width); - opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", - ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), - cn, kercn, block_size, - (sizeA.width % block_size !=0) ? " -D NO_MULT" : "", - haveC ? " -D HAVE_C" : "", - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + CV_CheckEQ(sizeA.width, sizeB.height, ""); + if (haveC) + CV_CheckEQ(sizeC, sizeD, ""); - ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); - if (k.empty()) - return false; + UMat A = matA.getUMat(); + UMat B = matB.getUMat(); - if (depth == CV_64F) - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, alpha, beta); - else - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, (float)alpha, (float)beta); + matD.create(sizeD, type); + UMat D = matD.getUMat(); - size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; - size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; + bool isPropagatedC2D = false; // D content is updated with C / C.t() - return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); - } - else + if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1) { if (haveC && beta != 0.0) { ctrans ? transpose(matC, D) : matC.copyTo(D); + isPropagatedC2D = true; } else { beta = 0.0; } - return intel_gpu_gemm(A, sizeA, - B, sizeB, - D, sizeD, - alpha, - beta, - atrans, btrans); + bool res = intel_gpu_gemm(A, matA.size(), + B, matB.size(), + D, sizeD, + alpha, + beta, + atrans, btrans, + isPropagatedC2D); + if (res) + return true; + // fallback on generic OpenCL code } + + if (sizeD.width < 8 || sizeD.height < 8) + return false; + + String opts; + + int wg_size = (int)dev.maxWorkGroupSize(); + int sizeDmin = std::min(sizeD.width, sizeD.height); + wg_size = std::min(wg_size, sizeDmin * sizeDmin); + int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; + + if (atrans) + A = A.t(); + + if (btrans) + B = B.t(); + + if (haveC && !isPropagatedC2D) + ctrans ? transpose(matC, D) : matC.copyTo(D); + + int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; + int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); + + opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", + ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), + cn, kercn, block_size, + (sizeA.width % block_size !=0) ? " -D NO_MULT" : "", + haveC ? " -D HAVE_C" : "", + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + + ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); + if (k.empty()) + return false; + + if (depth == CV_64F) + k.args(ocl::KernelArg::ReadOnlyNoSize(A), + ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), + ocl::KernelArg::ReadWrite(D, cn, kercn), + sizeA.width, alpha, beta); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(A), + ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), + ocl::KernelArg::ReadWrite(D, cn, kercn), + sizeA.width, (float)alpha, (float)beta); + + size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; + size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; + + return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false); } #endif diff --git a/modules/core/test/ocl/test_gemm.cpp b/modules/core/test/ocl/test_gemm.cpp index 825b506780..cb7cb0be1a 100644 --- a/modules/core/test/ocl/test_gemm.cpp +++ b/modules/core/test/ocl/test_gemm.cpp @@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm, double alpha, beta; + int M, N, K; + TEST_DECLARE_INPUT_PARAMETER(A); TEST_DECLARE_INPUT_PARAMETER(B); TEST_DECLARE_INPUT_PARAMETER(C); @@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm, void generateTestData() { - // set minimum size to 20, since testing less sizes doesn't make sense - Size ARoiSize = randomSize(20, MAX_VALUE); - Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11); + M = (int)randomDoubleLog(1, 100); + N = (int)randomDoubleLog(1, 100); + K = (int)randomDoubleLog(1, 1200); - if (atrans) - ARoiSize = Size(ARoiSize.height, ARoiSize.width); + M = roundUp(M, 1); + N = roundUp(N, 1); + K = roundUp(K, 1); - Size BRoiSize = randomSize(20, MAX_VALUE); - if (btrans) - BRoiSize.width = ARoiSize.width; - else - BRoiSize.height = ARoiSize.width; + Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M); + Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11); + Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K); Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11); - if (btrans) - BRoiSize = Size(BRoiSize.height, BRoiSize.width); - - Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width); + Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M); Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11); + randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11); + Size DRoiSize = Size(N, M); Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11); @@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy) for (int i = 0; i < test_loop_times; ++i) { generateTestData(); + SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K)); OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags)); OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags)); - double eps = D_roi.size().area() * 1e-4; + double eps = D_roi.size().area() * (1e-5 * K); OCL_EXPECT_MATS_NEAR(D, eps); } }