Merge pull request #25271 from fengyuentau:matmul_bias

Merge with https://github.com/opencv/opencv_extra/pull/1158

Todo:
- [x] Fix Attention pattern recognition.
- [x] Handle other backends.

Benchmark: "VIT_B_32 OCV/CPU", M1, results in milliseconds.

| Model | 4.x | This PR |
| - | - | - |
| VIT_B_32 OCV/CPU | 87.66 | **83.83** |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
1 year ago · b758897c29
parent 9716bf95ae
commit b758897c29
6 changed files with 345 additions and 41 deletions
--- a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
@ -12,6 +12,8 @@
 #include "../csl/tensor.hpp"
 #include "../csl/tensor_ops.hpp"

+#include "../kernels/eltwise_ops.hpp" // for adding bias
+
 #include <opencv2/core.hpp>

 #include <utility>
@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

-        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB,
+        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
                 const std::vector<size_t> &A_offsets_, const std::vector<size_t> &B_offsets_, std::vector<size_t> &C_offsets_,
                 size_t batch_)
            : stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
@ -33,6 +35,11 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                csl::copyMatToTensor<T>(B, input_B_tensor, stream);
            }

+            if (!bias.empty()) {
+                bias_tensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, bias_tensor, stream);
+            }
+
            transA = _transA;
            transB = _transB;
        }
@ -42,9 +49,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
-            CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) ||
-                       (inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1);
-
            auto input_A_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input_A = input_A_wrapper->getView();

@ -60,12 +64,26 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            auto output = output_wrapper->getSpan();

            csl::tensor_ops::gemmBatched<T>(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);
+
+            // add bias if exists
+            if (!bias_tensor.empty() || inputs.size() >= 3) {
+                csl::TensorView<T> bias;
+                if (bias_tensor.empty()) {
+                    auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
+                    bias = bias_wrapper->getView();
+                } else {
+                    bias = csl::TensorView<T>(bias_tensor);
+                }
+
+                kernels::eltwise_sum_2<T>(stream, output, output, bias);
+            }
        }

    private:
        csl::Stream stream;
        csl::cublas::Handle cublasHandle;
        csl::Tensor<T> input_B_tensor;
+        csl::Tensor<T> bias_tensor;
        bool transA, transB;

        std::vector<size_t> A_offsets;
--- a/modules/dnn/src/layers/matmul_layer.cpp
+++ b/modules/dnn/src/layers/matmul_layer.cpp
@ -26,6 +26,10 @@ using namespace cv::dnn::cuda4dnn;
 namespace cv { namespace dnn {

 class MatMulLayerImpl CV_FINAL : public MatMulLayer {
+#ifdef HAVE_OPENCL
+    UMat weight_umat, bias_umat;
+#endif
+
 public:
    MatMulLayerImpl(const LayerParams& params) {
        setParamsFrom(params);
@ -34,6 +38,8 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        trans_b = params.get<bool>("transB", false);
        alpha = params.get<float>("alpha", 1.f);
        beta = params.get<float>("beta", 1.f);
+
+        real_ndims_C = params.get<int>("real_ndims_C", -1);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE {
@ -48,8 +54,9 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
                                 const int requiredOutputs,
                                 std::vector<MatShape> &outputs,
                                 std::vector<MatShape> &internals) const CV_OVERRIDE {
-        CV_CheckGE(inputs.size(), static_cast<size_t>(1), "DNN/MatMul: one varible input at least");
-        CV_CheckLE(inputs.size(), static_cast<size_t>(2), "DNN/MatMul: two variable inputs at most");
+        int num_inputs = inputs.size() + blobs.size();
+        CV_CheckGE(num_inputs, 2, "DNN/MatMul: two inputs at least");
+        CV_CheckLE(num_inputs, 3, "DNN/MatMul: three inputs at most");

        const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]);
        CV_CheckGE(shape_A.size(), static_cast<size_t>(2), "DNN/MatMul: invalid shape of input A");
@ -64,7 +71,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        int K_B = trans_b ? nB : mB;
        CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K");

-        // Check legal broadcast. It is legal for sure if A and B are 2d, or one of them is 2d.
+        // Check if inputs are broadcastable.
        MatShape common_shape;
        if (shape_A.size() != 2 || shape_B.size() != 2) {
            const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B;
@ -89,6 +96,24 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
            common_shape[1] = N;
        }

+        // Check if bias is broadcastable
+        if (num_inputs == 3) {
+            const auto shape_C = blobs.empty() ? inputs.back() : shape(blobs.back());
+            if (real_ndims_C == 1) { // (1) or (N)
+                CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/MatMul: invalid dimension of C");
+            } else if (real_ndims_C >= 2) {
+                const auto &shape_large = common_shape.size() > shape_C.size() ? common_shape : shape_C;
+                const auto &shape_small = common_shape.size() > shape_C.size() ? shape_C : common_shape;
+                size_t diff_dims = shape_large.size() - shape_small.size();
+                for (size_t i = 0; i < shape_small.size(); i++) {
+                    const auto dl = shape_small[i], dm = shape_large[i + diff_dims];
+                    if (dl != 1 && dm != 1 && dl != dm) {
+                        CV_Error(Error::StsBadSize, "DNN/MatMul: invalid shape of C");
+                    }
+                }
+            }
+        }
+
        outputs.assign(1, common_shape);
        return false;
    }
@ -109,6 +134,44 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
            fastGemmPackB(blobs[0], packed_input_B, trans_b, opt);
            helper.updatePackedBOffsets(packed_input_B.size());
        }
+
+        // broadcast bias if needed
+        if ((inputs.size() + blobs.size()) >= 3 && blobs.size() >= 2) {
+            const auto bias_mat = blobs.back();
+            const auto bias_shape = shape(bias_mat);
+            bool is_broadcast_needed = real_ndims_C == 0 || real_ndims_C == 1 || (total(bias_shape) != total(C_shape) || bias_shape.size() != C_shape.size());
+
+            if (is_broadcast_needed) {
+                broadcast_bias = Mat(C_shape, CV_32F);
+                auto *broadcast_bias_ptr = broadcast_bias.ptr<float>();
+
+                const auto *bias = bias_mat.ptr<const float>();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < broadcast_bias.total(); i++) {
+                        broadcast_bias_ptr[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    size_t inner_size = C_shape.back(),
+                        loops = total(C_shape) / inner_size;
+                    for (size_t i = 0; i < loops; i++) {
+                        size_t step = i * inner_size;
+                        for (size_t j = 0; j < inner_size; j++) {
+                            broadcast_bias_ptr[step + j] = beta * bias[j];
+                        }
+                    }
+                } else {
+                    broadcast(bias_mat, C_shape, broadcast_bias);
+                }
+            } else {
+                broadcast_bias = blobs.back();
+            }
+        }
+
+#ifdef HAVE_OPENCL
+        weight_umat.release();
+        bias_umat.release();
+#endif
    }

    // works like Y = numpy.matmul(A, B)
@ -134,7 +197,38 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {

        const auto *a = A.ptr<const float>();
        auto *y = Y.ptr<float>();
-        std::memset(y, 0, Y.total() * sizeof(float));
+        // add bias if existed
+        if ((inputs.size() + blobs.size()) >= 3) {
+            const auto &shape_Y = shape(Y);
+            if (blobs.empty()) { // bias from input
+                const auto &bias_mat = inputs.back();
+                const auto *bias = bias_mat.ptr<const float>();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < Y.total(); i++) {
+                        y[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    const size_t inner_size = shape_Y.back(),
+                                 batches = total(Y) / inner_size;
+                    parallel_for_(Range(0, batches), [&] (const Range &r) {
+                        for (int i = r.start; i < r.end; i++) {
+                            const size_t output_offset = i * inner_size;
+                            for (size_t j = 0; j < inner_size; j++) {
+                                y[output_offset + j] = beta * bias[j];
+                            }
+                        }
+                    }, double(batches * inner_size * (1 / 1024.0)));
+                } else {
+                    broadcast(bias_mat, shape_Y, Y);
+                }
+            } else { // bias from constant
+                const auto *bias = broadcast_bias.ptr<const float>();
+                std::memcpy(y, bias, total(shape_Y) * sizeof(float));
+            }
+        } else {
+            std::memset(y, 0, Y.total() * sizeof(float));
+        }

        if (blobs.empty()) {
            const auto &B = inputs[1];
@ -158,14 +252,36 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        inputs_arr.getUMatVector(inputs);
        outputs_arr.getUMatVector(outputs);

+        // does not support bias as input
+        if (inputs.size() >= 3) {
+            return false;
+        }
+
        const auto &input_A = inputs[0];
-        UMat input_B;
+        auto &output = outputs[0];
+        const auto output_shape = shape(output);
+
        if (blobs.empty()) {
-            input_B = inputs[1];
+            weight_umat = inputs[1];
+            if ((inputs.size() + blobs.size() >= 3)) {
+                bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
+            }
        } else {
-            blobs[0].copyTo(input_B);
+            if (weight_umat.empty()) {
+                blobs.front().copyTo(weight_umat);
+            }
+            if ((inputs.size() + blobs.size() >= 3)) {
+                if (bias_umat.empty()) {
+                    broadcast_bias.copyTo(bias_umat);
+                }
+            } else {
+                if (bias_umat.empty()) {
+                    bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
+                }
+            }
        }
-        auto &output = outputs[0];
+
+        auto &input_B = weight_umat;

        int M = static_cast<int>(helper.M),
            N = static_cast<int>(helper.N),
@ -181,7 +297,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        UMat A, B, C, A_fp32, B_fp32, C_fp32;
        for (int i = 0; i < batch; i++) {
            A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M);
-            B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? K : N);
+            B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? N : K);
            C = output_2d.row(helper.C_rows[i]).reshape(1, M);

            if (trans_a) {
@ -200,7 +316,6 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
                B_fp32 = B;
                C_fp32 = C;
            }
-
            cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32);
            if (use_half) {
                A_fp32.convertTo(A, CV_16F);
@ -208,6 +323,12 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
                C_fp32.convertTo(C, CV_16F);
            }
        }
+
+        // add bias
+        if (!bias_umat.empty()) {
+            cv::add(output, bias_umat, output);
+        }
+
        return true;
    }
 #endif // HAVE_OPENCL
@ -216,18 +337,28 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
        auto& input_A_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::shared_ptr<ov::Node> matmul;
+        std::shared_ptr<ov::Node> result;
+        ov::Output<ov::Node> bias;

-        if (nodes.size() == 2) {
+        if (blobs.empty()) {
            auto &input_B_node = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-            matmul = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            if (nodes.size() >= 3) {
+                bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+                result = std::make_shared<ov::op::v1::Add>(result, bias);
+            }
        } else {
            auto input_B_shape = getShape<size_t>(blobs[0]);
            auto input_B_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, input_B_shape, blobs[0].data);
-            matmul = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            if ((nodes.size() + blobs.size()) >= 3) {
+                const auto bias_shape = shape(broadcast_bias);
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), broadcast_bias.data);
+                result = std::make_shared<ov::op::v1::Add>(result, bias);
+            }
        }

-        return Ptr<BackendNode>(new InfEngineNgraphNode(matmul));
+        return Ptr<BackendNode>(new InfEngineNgraphNode(result));
    }
 #endif // HAVE_DNN_NGRAPH

@ -239,7 +370,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {

        const auto input_A_shape = shape(*input_A_wrapper->getMat());
        const auto output_shape = shape(*output_wrapper->getMat());
-        if (output_shape.size() != 2) {
+        if ((inputs.size() + blobs.size()) >= 3 || output_shape.size() != 2) {
            return Ptr<BackendNode>();
        }

@ -259,11 +390,17 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
                              const std::vector<Ptr<BackendWrapper>>& inputs,
                              const std::vector<Ptr<BackendWrapper>>& outputs) override {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);
-        auto input_B = blobs.empty() ? Mat() : blobs[0];
+        auto input_B = Mat(), bias = Mat();
+        if (!blobs.empty()) {
+            input_B = blobs.front();
+            if (blobs.size() >= 2) {
+                bias = broadcast_bias;
+            }
+        }

        CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized");

-        return make_cuda_node<cuda4dnn::MatMulBroadcastOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch);
+        return make_cuda_node<cuda4dnn::MatMulBroadcastOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, bias, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch);
    }
 #endif // HAVE_CUDA

@ -275,7 +412,7 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
        auto input_A_desc = input_A_wrapper->getTensorDesc();
        auto input_A_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();

-        auto op = std::make_shared<ge::op::BatchMatMul>(name);
+        auto op = std::make_shared<ge::op::BatchMatMulV2>(name);

        // set attributes
        op->set_attr_adj_x1(trans_a);
@ -292,11 +429,31 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
            auto input_B_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
            op->set_input_x2_by_name(*input_B_node, "y");
            op->update_input_desc_x2(*input_B_desc);
+            if (inputs.size() >= 3) {
+                auto input_bias_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
+                auto input_bias_desc = input_bias_wrapper->getTensorDesc();
+                auto input_bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
+                op->set_input_bias_by_name(*input_bias_node, "y");
+                op->update_input_desc_bias(*input_bias_desc);
+            }
        } else { // constant input B
            auto B = blobs[0];
            auto const_B_node = std::make_shared<CannConstOp>(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str()));
            op->set_input_x2_by_name(*(const_B_node->getOp()), "y");
            op->update_input_desc_x2(*(const_B_node->getTensorDesc()));
+            if ((inputs.size() + blobs.size()) >= 3) { // does not support broadcast bias
+                auto bias_mat = blobs.back();
+                auto bias_shape = shape(bias_mat);
+
+                // reshape if 1d
+                if (real_ndims_C == 1 && bias_shape.front() != 1) {
+                    bias_shape = std::vector<int>{bias_shape.front()};
+                }
+
+                auto const_bias_node = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), bias_shape, cv::format("%s_bias", name.c_str()));
+                op->set_input_bias_by_name(*(const_bias_node->getOp()), "y");
+                op->update_input_desc_bias(*(const_bias_node->getTensorDesc()));
+            }
        }

        // set outputs
@ -312,7 +469,10 @@ class MatMulLayerImpl CV_FINAL : public MatMulLayer {
    float alpha;
    float beta;

+    int real_ndims_C;
+
    std::vector<float> packed_input_B;
+    Mat broadcast_bias;

    FastGemmOpt opt;
    MatMulHelper helper;
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@ -242,6 +242,115 @@ class AdjustSliceAllOptionalInputsSubgraph : public Subgraph {
    size_t num_inputs_;
 };

+/* Fusion for biased MatMul.
+
+   Graph before fusion: [Input] -> MatMul -> Add -> [Output]
+
+   Graph after fusion:  [Input] -> MatMul -> [Output]
+                                     \
+                                     bias
+*/
+
+class BiasedMatmulSubgraph : public Subgraph {
+ public:
+    // Registers the pattern Add(<any>, MatMul(<any>, <any>)). The fused node is a
+    // "MatMul" that starts with only the MatMul's first input; the weight and the
+    // bias names are appended later in finalize().
+    BiasedMatmulSubgraph() {
+        const int data = addNodeToMatch("");
+        const int weight = addNodeToMatch("");
+        matmul_id = addNodeToMatch("MatMul", data, weight);
+        const int bias = addNodeToMatch("");
+        add_id = addNodeToMatch("Add", bias, matmul_id);
+
+        setFusedNode("MatMul", data);
+    }
+
+    // Accepts the structural match only when
+    //   - input 0 of MatMul is neither an initializer nor the output of a Constant node,
+    //   - input 1 of MatMul is an initializer or the output of a Constant node,
+    //   - one of the two Add inputs is an initializer or the output of a Constant node.
+    // On success the names of the weight and bias tensors are stored in
+    // weight_name / bias_name for finalize().
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
+        if (!Subgraph::match(net, nodeId, matchedNodesIds)) {
+            return false;
+        }
+
+        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
+        const int matmul_node_id = matchedNodesIds[matmul_id];
+        const int add_node_id = matchedNodesIds[add_id];
+
+        // input A of MatMul must not be constant
+        if (onnx_net->getInputInitializerId(matmul_node_id, 0) >= 0 ||
+            isConstantNodeOutput(net, matmul_node_id, 0)) {
+            return false;
+        }
+
+        // weight: initializer first, Constant node second
+        const int weight_initializer_id = onnx_net->getInputInitializerId(matmul_node_id, 1);
+        if (weight_initializer_id != -1) {
+            weight_name = onnx_net->getNameOfInitializer(weight_initializer_id);
+        } else if (isConstantNodeOutput(net, matmul_node_id, 1)) {
+            weight_name = net->getNode(matmul_node_id)->getInputName(1);
+        } else {
+            return false;
+        }
+
+        // bias: may sit on either input of Add; initializers are looked up on both
+        // inputs before any Constant node is considered
+        const int bias_initializer_id = std::max(onnx_net->getInputInitializerId(add_node_id, 0),
+                                                 onnx_net->getInputInitializerId(add_node_id, 1));
+        if (bias_initializer_id != -1) {
+            bias_name = onnx_net->getNameOfInitializer(bias_initializer_id);
+            return true;
+        }
+
+        int bias_input_idx = -1;
+        if (isConstantNodeOutput(net, add_node_id, 0)) {
+            bias_input_idx = 0;
+        } else if (isConstantNodeOutput(net, add_node_id, 1)) {
+            bias_input_idx = 1;
+        }
+        if (bias_input_idx < 0) {
+            return false;
+        }
+
+        bias_name = net->getNode(add_node_id)->getInputName(bias_input_idx);
+        return true;
+    }
+
+    // Appends the weight and bias tensor names captured by match() as additional
+    // inputs of the fused node, in that order.
+    virtual void finalize(const Ptr<ImportGraphWrapper>& net,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE {
+        opencv_onnx::NodeProto* fused = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        fused->add_input(weight_name);
+        fused->add_input(bias_name);
+    }
+
+ private:
+    // Returns true when input `input_idx` of node `node_id` is produced by a node
+    // whose type is "Constant".
+    static bool isConstantNodeOutput(const Ptr<ImportGraphWrapper>& net, int node_id, int input_idx) {
+        const Ptr<ImportNodeWrapper> consumer = net->getNode(node_id);
+        const int producer_id = Subgraph::getInputNodeId(net, consumer, input_idx);
+        return net->getNode(producer_id)->getType() == "Constant";
+    }
+
+    int matmul_id, add_id;               // pattern ids returned by addNodeToMatch()
+    std::string weight_name, bias_name;  // filled by match(), consumed by finalize()
+};
+
 /*  The fusion for the multi-head attention from vision transformer.

    Abbreviations:
@ -322,22 +431,21 @@ class AttentionSubGraph : public Subgraph {
    AttentionSubGraph() {
        int input = addNodeToMatch("");
        int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph

        // v_path
-        slice_v = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        int reshape_v = addNodeToMatch("Reshape", slice_v, addNodeToMatch(""));
        int transpose_v = addNodeToMatch("Transpose", reshape_v);

        // q_path
-        slice_q = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        reshape_q = addNodeToMatch("Reshape", slice_q, addNodeToMatch(""));
        int transpose_q = addNodeToMatch("Transpose", reshape_q);
        div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

        // k_path
-        slice_k = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        int reshape_k = addNodeToMatch("Reshape", slice_k, addNodeToMatch(""));
        int transpose_k = addNodeToMatch("Transpose", reshape_k);

@ -380,7 +488,7 @@ class AttentionSubGraph : public Subgraph {

            // get names
            weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-            bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+            bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
            return true;
        }
        return false;
@ -414,7 +522,7 @@ class AttentionSubGraph : public Subgraph {
    }

 private:
-    int att_matmul, att_add;
+    int att_matmul;
    int slice_q, slice_k, slice_v;
    int reshape_q, div_q, last_reshape;

@ -436,20 +544,19 @@ class AttentionSingleHeadSubGraph : public Subgraph {
    AttentionSingleHeadSubGraph() {
        int input = addNodeToMatch("");
        int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
-        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""));
-        att_add = addNodeToMatch("Add", addNodeToMatch(""), att_matmul);
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph

        // v_path
-        slice_v = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        int transpose_v = addNodeToMatch("Transpose", slice_v);

        // q_path
-        slice_q = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        int transpose_q = addNodeToMatch("Transpose", slice_q);
        div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));

        // k_path
-        slice_k = addNodeToMatch("Slice", std::vector<int>{att_add, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
        int transpose_k = addNodeToMatch("Transpose", slice_k);

        // qk
@ -491,7 +598,7 @@ class AttentionSingleHeadSubGraph : public Subgraph {

            // get names
            weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
-            bias_name = getInputName(net, matchedNodesIds[att_add], 0);
+            bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
            return true;
        }
        return false;
@ -525,7 +632,7 @@ class AttentionSingleHeadSubGraph : public Subgraph {
    }

 protected:
-    int att_matmul, att_add;
+    int att_matmul;
    int slice_q, slice_k, slice_v;
    int div_q, last_reshape;

@ -1558,6 +1665,7 @@ public:
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
    std::vector<Ptr<Subgraph> > subgraphs;
+    subgraphs.push_back(makePtr<BiasedMatmulSubgraph>());
    subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(3));
    subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(4));
    subgraphs.push_back(makePtr<GeluSubGraph>());
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -1961,7 +1961,8 @@ void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodePr

 void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) {
    auto node_proto = node_proto_;
-    CV_CheckEQ(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required");
+    CV_CheckGE(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required at least");
+    CV_CheckLE(node_proto.input_size(), 3, "ONNXImporter/MatMul: three inputs required at most");

    for (int i = 0; i < node_proto.input_size(); i++) {
        if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
@ -1970,9 +1971,7 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node

        Mat blob = getBlob(node_proto, i);

-        if (i == 1) {
-            layerParams.blobs.push_back(blob);
-        } else {
+        if (i == 0) {
            LayerParams const_params;
            const_params.name = node_proto.input(i);
            const_params.type = "Const";
@ -1983,6 +1982,12 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
            addLayer(const_params, const_node_proto);

            node_proto.set_input(i, const_params.name);
+        } else {
+            layerParams.blobs.push_back(blob);
+        }
+
+        if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
+            layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
        }
    }

--- a/modules/dnn/test/test_graph_simplifier.cpp
+++ b/modules/dnn/test/test_graph_simplifier.cpp
@ -143,4 +143,11 @@ TEST_F(Test_Graph_Simplifier, AttentionSubgraph) {
    test("attention_single_head", "Attention");
 }

+TEST_F(Test_Graph_Simplifier, BiasedMatMulSubgraph) {
+    /* Test for 1 subgraph:
+        - BiasedMatmulSubgraph (the MatMul -> Add fusion registered in simplifySubgraphs).
+       "MatMul" is the second argument passed to test(); presumably the layer type
+       expected to remain after simplification of the "biased_matmul" model - TODO confirm.
+    */
+    test("biased_matmul", "MatMul");
+}
+
 }}
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -3090,6 +3090,12 @@ TEST_P(Test_ONNX_layers, LayerNormNoFusion) {
    testONNXModels("layer_norm_no_fusion");
 }

+TEST_P(Test_ONNX_layers, MatMulAddFusion) {
+    // Start from default_l1 / default_lInf and switch to explicit thresholds only
+    // for the nGraph inference-engine backend running on the OpenCL target.
+    const bool ngraph_on_opencl = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL;
+    double l1 = default_l1;
+    double lInf = default_lInf;
+    if (ngraph_on_opencl) {
+        l1 = 0.0018;
+        lInf = 0.011;
+    }
+    testONNXModels("biased_matmul", npy, l1, lInf);
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets());

 }} // namespace