Merge pull request #25271 from fengyuentau:matmul_bias

Merge with https://github.com/opencv/opencv_extra/pull/1158

Todo:

- [x] Fix Attention pattern recognition.
- [x] Handle other backends.

Benchmark:

"VIT_B_32 OCV/CPU", M1, results in milliseconds.

| Model | 4.x | This PR |
| - | - | - |
| VIT_B_32 OCV/CPU | 87.66 | **83.83** |


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
pull/25299/head
Yuantao Feng 8 months ago committed by GitHub
parent 9716bf95ae
commit b758897c29
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 26
      modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
  2. 198
      modules/dnn/src/layers/matmul_layer.cpp
  3. 136
      modules/dnn/src/onnx/onnx_graph_simplifier.cpp
  4. 13
      modules/dnn/src/onnx/onnx_importer.cpp
  5. 7
      modules/dnn/test/test_graph_simplifier.cpp
  6. 6
      modules/dnn/test/test_onnx_importer.cpp

@ -12,6 +12,8 @@
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"
#include "../kernels/eltwise_ops.hpp" // for adding bias
#include <opencv2/core.hpp>
#include <utility>
@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
public:
using wrapper_type = GetCUDABackendWrapperType<T>;
MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB,
MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
const std::vector<size_t> &A_offsets_, const std::vector<size_t> &B_offsets_, std::vector<size_t> &C_offsets_,
size_t batch_)
: stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
@ -33,6 +35,11 @@ namespace cv { namespace dnn { namespace cuda4dnn {
csl::copyMatToTensor<T>(B, input_B_tensor, stream);
}
if (!bias.empty()) {
bias_tensor = csl::makeTensorHeader<T>(bias);
csl::copyMatToTensor<T>(bias, bias_tensor, stream);
}
transA = _transA;
transB = _transB;
}
@ -42,9 +49,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override
{
CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) ||
(inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1);
auto input_A_wrapper = inputs[0].dynamicCast<wrapper_type>();
auto input_A = input_A_wrapper->getView();
@ -60,12 +64,26 @@ namespace cv { namespace dnn { namespace cuda4dnn {
auto output = output_wrapper->getSpan();
csl::tensor_ops::gemmBatched<T>(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);
// add bias if exists
if (!bias_tensor.empty() || inputs.size() >= 3) {
csl::TensorView<T> bias;
if (bias_tensor.empty()) {
auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
bias = bias_wrapper->getView();
} else {
bias = csl::TensorView<T>(bias_tensor);
}
kernels::eltwise_sum_2<T>(stream, output, output, bias);
}
}
private:
csl::Stream stream;
csl::cublas::Handle cublasHandle;
csl::Tensor<T> input_B_tensor;
csl::Tensor<T> bias_tensor;
bool transA, transB;
std::vector<size_t> A_offsets;

@ -26,6 +26,10 @@ using namespace cv::dnn::cuda4dnn;
namespace cv { namespace dnn {
class MatMulLayerImpl CV_FINAL : public MatMulLayer {
#ifdef HAVE_OPENCL
UMat weight_umat, bias_umat;
#endif
public:
MatMulLayerImpl(const LayerParams& params) {
setParamsFrom(params);
@ -34,6 +38,8 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
trans_b = params.get<bool>("transB", false);
alpha = params.get<float>("alpha", 1.f);
beta = params.get<float>("beta", 1.f);
real_ndims_C = params.get<int>("real_ndims_C", -1);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE {
@ -48,8 +54,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE {
CV_CheckGE(inputs.size(), static_cast<size_t>(1), "DNN/MatMul: one varible input at least");
CV_CheckLE(inputs.size(), static_cast<size_t>(2), "DNN/MatMul: two variable inputs at most");
int num_inputs = inputs.size() + blobs.size();
CV_CheckGE(num_inputs, 2, "DNN/MatMul: two inputs at least");
CV_CheckLE(num_inputs, 3, "DNN/MatMul: three inputs at most");
const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]);
CV_CheckGE(shape_A.size(), static_cast<size_t>(2), "DNN/MatMul: invalid shape of input A");
@ -64,7 +71,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
int K_B = trans_b ? nB : mB;
CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K");
// Check legal broadcast. It is legal for sure if A and B are 2d, or one of them is 2d.
// Check if inputs are broadcastable.
MatShape common_shape;
if (shape_A.size() != 2 || shape_B.size() != 2) {
const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B;
@ -89,6 +96,24 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
common_shape[1] = N;
}
// Check if bias is broadcastable
if (num_inputs == 3) {
const auto shape_C = blobs.empty() ? inputs.back() : shape(blobs.back());
if (real_ndims_C == 1) { // (1) or (N)
CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/MatMul: invalid dimension of C");
} else if (real_ndims_C >= 2) {
const auto &shape_large = common_shape.size() > shape_C.size() ? common_shape : shape_C;
const auto &shape_small = common_shape.size() > shape_C.size() ? shape_C : common_shape;
size_t diff_dims = shape_large.size() - shape_small.size();
for (size_t i = 0; i < shape_small.size(); i++) {
const auto dl = shape_small[i], dm = shape_large[i + diff_dims];
if (dl != 1 && dm != 1 && dl != dm) {
CV_Error(Error::StsBadSize, "DNN/MatMul: invalid shape of C");
}
}
}
}
outputs.assign(1, common_shape);
return false;
}
@ -109,6 +134,44 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
fastGemmPackB(blobs[0], packed_input_B, trans_b, opt);
helper.updatePackedBOffsets(packed_input_B.size());
}
// broadcast bias if needed
if ((inputs.size() + blobs.size()) >= 3 && blobs.size() >= 2) {
const auto bias_mat = blobs.back();
const auto bias_shape = shape(bias_mat);
bool is_broadcast_needed = real_ndims_C == 0 || real_ndims_C == 1 || (total(bias_shape) != total(C_shape) || bias_shape.size() != C_shape.size());
if (is_broadcast_needed) {
broadcast_bias = Mat(C_shape, CV_32F);
auto *broadcast_bias_ptr = broadcast_bias.ptr<float>();
const auto *bias = bias_mat.ptr<const float>();
if (bias_mat.total() == 1) { // [], [1], [1, ...]
float b = (*bias) * beta;
for (size_t i = 0; i < broadcast_bias.total(); i++) {
broadcast_bias_ptr[i] = b;
}
} else if (real_ndims_C == 1) { // [n]
size_t inner_size = C_shape.back(),
loops = total(C_shape) / inner_size;
for (size_t i = 0; i < loops; i++) {
size_t step = i * inner_size;
for (size_t j = 0; j < inner_size; j++) {
broadcast_bias_ptr[step + j] = beta * bias[j];
}
}
} else {
broadcast(bias_mat, C_shape, broadcast_bias);
}
} else {
broadcast_bias = blobs.back();
}
}
#ifdef HAVE_OPENCL
weight_umat.release();
bias_umat.release();
#endif
}
// works like Y = numpy.matmul(A, B)
@ -134,7 +197,38 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
const auto *a = A.ptr<const float>();
auto *y = Y.ptr<float>();
std::memset(y, 0, Y.total() * sizeof(float));
// add bias if existed
if ((inputs.size() + blobs.size()) >= 3) {
const auto &shape_Y = shape(Y);
if (blobs.empty()) { // bias from input
const auto &bias_mat = inputs.back();
const auto *bias = bias_mat.ptr<const float>();
if (bias_mat.total() == 1) { // [], [1], [1, ...]
float b = (*bias) * beta;
for (size_t i = 0; i < Y.total(); i++) {
y[i] = b;
}
} else if (real_ndims_C == 1) { // [n]
const size_t inner_size = shape_Y.back(),
batches = total(Y) / inner_size;
parallel_for_(Range(0, batches), [&] (const Range &r) {
for (int i = r.start; i < r.end; i++) {
const size_t output_offset = i * inner_size;
for (size_t j = 0; j < inner_size; j++) {
y[output_offset + j] = beta * bias[j];
}
}
}, double(batches * inner_size * (1 / 1024.0)));
} else {
broadcast(bias_mat, shape_Y, Y);
}
} else { // bias from constant
const auto *bias = broadcast_bias.ptr<const float>();
std::memcpy(y, bias, total(shape_Y) * sizeof(float));
}
} else {
std::memset(y, 0, Y.total() * sizeof(float));
}
if (blobs.empty()) {
const auto &B = inputs[1];
@ -158,14 +252,36 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
inputs_arr.getUMatVector(inputs);
outputs_arr.getUMatVector(outputs);
// does not support bias as input
if (inputs.size() >= 3) {
return false;
}
const auto &input_A = inputs[0];
UMat input_B;
auto &output = outputs[0];
const auto output_shape = shape(output);
if (blobs.empty()) {
input_B = inputs[1];
weight_umat = inputs[1];
if ((inputs.size() + blobs.size() >= 3)) {
bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
}
} else {
blobs[0].copyTo(input_B);
if (weight_umat.empty()) {
blobs.front().copyTo(weight_umat);
}
if ((inputs.size() + blobs.size() >= 3)) {
if (bias_umat.empty()) {
broadcast_bias.copyTo(bias_umat);
}
} else {
if (bias_umat.empty()) {
bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
}
}
}
auto &output = outputs[0];
auto &input_B = weight_umat;
int M = static_cast<int>(helper.M),
N = static_cast<int>(helper.N),
@ -181,7 +297,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
UMat A, B, C, A_fp32, B_fp32, C_fp32;
for (int i = 0; i < batch; i++) {
A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M);
B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? K : N);
B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? N : K);
C = output_2d.row(helper.C_rows[i]).reshape(1, M);
if (trans_a) {
@ -200,7 +316,6 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
B_fp32 = B;
C_fp32 = C;
}
cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32);
if (use_half) {
A_fp32.convertTo(A, CV_16F);
@ -208,6 +323,12 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
C_fp32.convertTo(C, CV_16F);
}
}
// add bias
if (!bias_umat.empty()) {
cv::add(output, bias_umat, output);
}
return true;
}
#endif // HAVE_OPENCL
@ -216,18 +337,28 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
auto& input_A_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::shared_ptr<ov::Node> matmul;
std::shared_ptr<ov::Node> result;
ov::Output<ov::Node> bias;
if (nodes.size() == 2) {
if (blobs.empty()) {
auto &input_B_node = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
matmul = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
if (nodes.size() >= 3) {
bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
result = std::make_shared<ov::op::v1::Add>(result, bias);
}
} else {
auto input_B_shape = getShape<size_t>(blobs[0]);
auto input_B_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, input_B_shape, blobs[0].data);
matmul = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
if ((nodes.size() + blobs.size()) >= 3) {
const auto bias_shape = shape(broadcast_bias);
bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), broadcast_bias.data);
result = std::make_shared<ov::op::v1::Add>(result, bias);
}
}
return Ptr<BackendNode>(new InfEngineNgraphNode(matmul));
return Ptr<BackendNode>(new InfEngineNgraphNode(result));
}
#endif // HAVE_DNN_NGRAPH
@ -239,7 +370,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
const auto input_A_shape = shape(*input_A_wrapper->getMat());
const auto output_shape = shape(*output_wrapper->getMat());
if (output_shape.size() != 2) {
if ((inputs.size() + blobs.size()) >= 3 || output_shape.size() != 2) {
return Ptr<BackendNode>();
}
@ -259,11 +390,17 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs) override {
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto input_B = blobs.empty() ? Mat() : blobs[0];
auto input_B = Mat(), bias = Mat();
if (!blobs.empty()) {
input_B = blobs.front();
if (blobs.size() >= 2) {
bias = broadcast_bias;
}
}
CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized");
return make_cuda_node<cuda4dnn::MatMulBroadcastOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch);
return make_cuda_node<cuda4dnn::MatMulBroadcastOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, bias, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch);
}
#endif // HAVE_CUDA
@ -275,7 +412,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
auto input_A_desc = input_A_wrapper->getTensorDesc();
auto input_A_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
auto op = std::make_shared<ge::op::BatchMatMul>(name);
auto op = std::make_shared<ge::op::BatchMatMulV2>(name);
// set attributes
op->set_attr_adj_x1(trans_a);
@ -292,11 +429,31 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
auto input_B_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
op->set_input_x2_by_name(*input_B_node, "y");
op->update_input_desc_x2(*input_B_desc);
if (inputs.size() >= 3) {
auto input_bias_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
auto input_bias_desc = input_bias_wrapper->getTensorDesc();
auto input_bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
op->set_input_bias_by_name(*input_bias_node, "y");
op->update_input_desc_bias(*input_bias_desc);
}
} else { // constant input B
auto B = blobs[0];
auto const_B_node = std::make_shared<CannConstOp>(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str()));
op->set_input_x2_by_name(*(const_B_node->getOp()), "y");
op->update_input_desc_x2(*(const_B_node->getTensorDesc()));
if ((inputs.size() + blobs.size()) >= 3) { // does not support broadcast bias
auto bias_mat = blobs.back();
auto bias_shape = shape(bias_mat);
// reshape if 1d
if (real_ndims_C == 1 && bias_shape.front() != 1) {
bias_shape = std::vector<int>{bias_shape.front()};
}
auto const_bias_node = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), bias_shape, cv::format("%s_bias", name.c_str()));
op->set_input_bias_by_name(*(const_bias_node->getOp()), "y");
op->update_input_desc_bias(*(const_bias_node->getTensorDesc()));
}
}
// set outputs
@ -312,7 +469,10 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
float alpha;
float beta;
int real_ndims_C;
std::vector<float> packed_input_B;
Mat broadcast_bias;
FastGemmOpt opt;
MatMulHelper helper;

@ -242,6 +242,115 @@ class AdjustSliceAllOptionalInputsSubgraph : public Subgraph {
size_t num_inputs_;
};
/* Fuses a MatMul followed by an Add into a single MatMul that carries the bias as an extra input.

   Graph before fusion: [Input] -> MatMul -> Add -> [Output]

   Graph after fusion:  [Input] -> MatMul -> [Output]
                                     /  \
                                weight  bias

   The pattern is accepted only when input A of the MatMul is not constant, while both the
   MatMul weight and the Add bias come from an initializer or from a "Constant" node.
*/
class BiasedMatmulSubgraph : public Subgraph {
public:
    BiasedMatmulSubgraph() {
        // Placeholders are registered in the same order as the pattern is read: A, weight, bias.
        int input = addNodeToMatch("");
        int weight_placeholder = addNodeToMatch("");
        matmul_id = addNodeToMatch("MatMul", input, weight_placeholder);
        int bias_placeholder = addNodeToMatch("");
        add_id = addNodeToMatch("Add", bias_placeholder, matmul_id);

        // The fused node starts with input A only; weight and bias are appended in finalize().
        setFusedNode("MatMul", input);
    }

    // Runs the generic pattern match, then verifies the constness requirements and
    // records the names of the weight and bias tensors for finalize().
    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
        if (!Subgraph::match(net, nodeId, matchedNodesIds)) {
            return false;
        }

        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
        const int matched_matmul = matchedNodesIds[matmul_id];
        const int matched_add = matchedNodesIds[add_id];

        // True when input `idx` of `consumer` is produced by a node of type "Constant".
        auto fed_by_constant_node = [&](const Ptr<ImportNodeWrapper>& consumer, int idx) {
            const int producer_id = Subgraph::getInputNodeId(net, consumer, idx);
            auto producer = net->getNode(producer_id);
            return producer->getType() == "Constant";
        };

        // Input A of MatMul must be a variable input: neither an initializer nor a Constant node.
        if (onnx_net->getInputInitializerId(matched_matmul, 0) >= 0) {
            return false;
        }
        const Ptr<ImportNodeWrapper> matmul_node = net->getNode(matched_matmul);
        if (fed_by_constant_node(matmul_node, 0)) {
            return false;
        }

        // Input B (weight) of MatMul must be constant: initializer first, Constant node second.
        const int weight_initializer = onnx_net->getInputInitializerId(matched_matmul, 1);
        if (weight_initializer != -1) {
            weight_name = onnx_net->getNameOfInitializer(weight_initializer);
        } else if (fed_by_constant_node(matmul_node, 1)) {
            weight_name = matmul_node->getInputName(1);
        } else {
            return false;
        }

        // The bias may sit on either input of Add. An initializer on any side wins;
        // otherwise look for a Constant node on input 0, then on input 1.
        const int bias_initializer = std::max(onnx_net->getInputInitializerId(matched_add, 0),
                                              onnx_net->getInputInitializerId(matched_add, 1));
        if (bias_initializer != -1) {
            bias_name = onnx_net->getNameOfInitializer(bias_initializer);
            return true;
        }

        const Ptr<ImportNodeWrapper> add_node = net->getNode(matched_add);
        for (int idx = 0; idx < 2; idx++) {
            if (fed_by_constant_node(add_node, idx)) {
                bias_name = add_node->getInputName(idx);
                return true;
            }
        }
        return false;
    }

    // Appends the weight and bias names recorded by match() as inputs of the fused MatMul node.
    virtual void finalize(const Ptr<ImportGraphWrapper>& net,
                          const Ptr<ImportNodeWrapper>& fusedNode,
                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE {
        opencv_onnx::NodeProto* fused_proto = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
        fused_proto->add_input(weight_name);
        fused_proto->add_input(bias_name);
    }

private:
    int matmul_id, add_id;              // pattern ids of the MatMul and Add nodes
    std::string weight_name, bias_name; // tensor names captured by match()
};
/* The fusion for the multi-head attention from vision transformer.
Abbreviations:
@ -322,22 +431,21 @@ class AttentionSubGraph : public Subgraph {
AttentionSubGraph() {
int input = addNodeToMatch("");
int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph
// v_path
slice_v = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
int reshape_v = addNodeToMatch("Reshape", slice_v, addNodeToMatch(""));
int transpose_v = addNodeToMatch("Transpose", reshape_v);
// q_path
slice_q = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
reshape_q = addNodeToMatch("Reshape", slice_q, addNodeToMatch(""));
int transpose_q = addNodeToMatch("Transpose", reshape_q);
div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));
// k_path
slice_k = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
int reshape_k = addNodeToMatch("Reshape", slice_k, addNodeToMatch(""));
int transpose_k = addNodeToMatch("Transpose", reshape_k);
@ -380,7 +488,7 @@ class AttentionSubGraph : public Subgraph {
// get names
weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
bias_name = getInputName(net, matchedNodesIds[att_add], 0);
bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
return true;
}
return false;
@ -414,7 +522,7 @@ class AttentionSubGraph : public Subgraph {
}
private:
int att_matmul, att_add;
int att_matmul;
int slice_q, slice_k, slice_v;
int reshape_q, div_q, last_reshape;
@ -436,20 +544,19 @@ class AttentionSingleHeadSubGraph : public Subgraph {
AttentionSingleHeadSubGraph() {
int input = addNodeToMatch("");
int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph
// v_path
slice_v = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
int transpose_v = addNodeToMatch("Transpose", slice_v);
// q_path
slice_q = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
int transpose_q = addNodeToMatch("Transpose", slice_q);
div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));
// k_path
slice_k = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
int transpose_k = addNodeToMatch("Transpose", slice_k);
// qk
@ -491,7 +598,7 @@ class AttentionSingleHeadSubGraph : public Subgraph {
// get names
weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
bias_name = getInputName(net, matchedNodesIds[att_add], 0);
bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
return true;
}
return false;
@ -525,7 +632,7 @@ class AttentionSingleHeadSubGraph : public Subgraph {
}
protected:
int att_matmul, att_add;
int att_matmul;
int slice_q, slice_k, slice_v;
int div_q, last_reshape;
@ -1558,6 +1665,7 @@ public:
void simplifySubgraphs(opencv_onnx::GraphProto& net)
{
std::vector<Ptr<Subgraph> > subgraphs;
subgraphs.push_back(makePtr<BiasedMatmulSubgraph>());
subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(3));
subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(4));
subgraphs.push_back(makePtr<GeluSubGraph>());

@ -1961,7 +1961,8 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr
void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) {
auto node_proto = node_proto_;
CV_CheckEQ(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required");
CV_CheckGE(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required at least");
CV_CheckLE(node_proto.input_size(), 3, "ONNXImporter/MatMul: three inputs required at most");
for (int i = 0; i < node_proto.input_size(); i++) {
if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
@ -1970,9 +1971,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
Mat blob = getBlob(node_proto, i);
if (i == 1) {
layerParams.blobs.push_back(blob);
} else {
if (i == 0) {
LayerParams const_params;
const_params.name = node_proto.input(i);
const_params.type = "Const";
@ -1983,6 +1982,12 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
addLayer(const_params, const_node_proto);
node_proto.set_input(i, const_params.name);
} else {
layerParams.blobs.push_back(blob);
}
if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
}
}

@ -143,4 +143,11 @@ TEST_F(Test_Graph_Simplifier, AttentionSubgraph) {
test("attention_single_head", "Attention");
}
TEST_F(Test_Graph_Simplifier, BiasedMatMulSubgraph) {
/* Test for 1 subgraph:
- BiasedMatMulSubgraph
Uses the "biased_matmul" model; presumably test() expects the simplified
graph to consist of the listed layer type ("MatMul") - confirm against the
test() helper of the fixture.
*/
test("biased_matmul", "MatMul");
}
}}

@ -3090,6 +3090,12 @@ TEST_P(Test_ONNX_layers, LayerNormNoFusion) {
testONNXModels("layer_norm_no_fusion");
}
TEST_P(Test_ONNX_layers, MatMulAddFusion) {
    // The Inference Engine nGraph backend on the OpenCL target is compared with
    // relaxed thresholds; every other backend/target pair uses the defaults.
    const bool is_ngraph_opencl = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH &&
                                  target == DNN_TARGET_OPENCL;
    double l1 = default_l1;
    double lInf = default_lInf;
    if (is_ngraph_opencl) {
        l1 = 0.0018;
        lInf = 0.011;
    }
    testONNXModels("biased_matmul", npy, l1, lInf);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets());
}} // namespace

Loading…
Cancel
Save