|
|
|
@ -12,6 +12,8 @@ |
|
|
|
|
#include "../csl/tensor.hpp" |
|
|
|
|
#include "../csl/tensor_ops.hpp" |
|
|
|
|
|
|
|
|
|
#include "../kernels/scale_shift.hpp" |
|
|
|
|
|
|
|
|
|
#include <opencv2/core.hpp> |
|
|
|
|
|
|
|
|
|
#include <utility> |
|
|
|
@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { |
|
|
|
|
public: |
|
|
|
|
using wrapper_type = GetCUDABackendWrapperType<T>; |
|
|
|
|
|
|
|
|
|
MatMulOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat& constInp) |
|
|
|
|
MatMulOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat& constInp, const Mat& bias, bool _transA, bool _transB) |
|
|
|
|
: stream(std::move(stream_)), cublasHandle(std::move(handle)) |
|
|
|
|
{ |
|
|
|
|
if (!constInp.empty()) |
|
|
|
@ -31,6 +33,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { |
|
|
|
|
constTensor = csl::makeTensorHeader<T>(constInp); |
|
|
|
|
csl::copyMatToTensor<T>(constInp, constTensor, stream); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (!bias.empty()) |
|
|
|
|
{ |
|
|
|
|
biasTensor = csl::makeTensorHeader<T>(bias); |
|
|
|
|
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
transA = _transA; |
|
|
|
|
transB = _transB; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void forward( |
|
|
|
@ -69,50 +80,72 @@ namespace cv { namespace dnn { namespace cuda4dnn { |
|
|
|
|
CV_Assert(input2.get_axis_size(i) == size); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
auto m = input1.get_axis_size(-2); |
|
|
|
|
auto n = input1.get_axis_size(-1); |
|
|
|
|
auto b = input1.size() / m / n; |
|
|
|
|
int k; |
|
|
|
|
if (constTensor.empty()) |
|
|
|
|
int m1, n1, b1, m2, n2, b2; |
|
|
|
|
if (transA) |
|
|
|
|
{ |
|
|
|
|
m1 = input1.get_axis_size(-1); |
|
|
|
|
n1 = input1.get_axis_size(-2); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
m1 = input1.get_axis_size(-2); |
|
|
|
|
n1 = input1.get_axis_size(-1); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (transB) |
|
|
|
|
{ |
|
|
|
|
k = input2.get_axis_size(-1); |
|
|
|
|
CV_Assert(input2.get_axis_size(-2) == n); |
|
|
|
|
m2 = input2.get_axis_size(-1); |
|
|
|
|
n2 = input2.get_axis_size(-2); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
k = input2.get_axis_size(-2); |
|
|
|
|
CV_Assert(input2.get_axis_size(-1) == n); |
|
|
|
|
m2 = input2.get_axis_size(-2); |
|
|
|
|
n2 = input2.get_axis_size(-1); |
|
|
|
|
} |
|
|
|
|
CV_Assert(output.get_axis_size(-2) == m); |
|
|
|
|
CV_Assert(output.get_axis_size(-1) == k); |
|
|
|
|
|
|
|
|
|
b1 = input1.size() / m1 / n1; |
|
|
|
|
b2 = input2.size() / m2 / n2; |
|
|
|
|
CV_Assert(b1 == b2); |
|
|
|
|
CV_Assert(n1 == m2); |
|
|
|
|
CV_Assert(output.get_axis_size(-2) == m1); |
|
|
|
|
CV_Assert(output.get_axis_size(-1) == n2); |
|
|
|
|
|
|
|
|
|
if (get_effective_rank(output) <= 2) |
|
|
|
|
{ |
|
|
|
|
CV_Assert(b == 1); |
|
|
|
|
CV_Assert(b2 == 1); |
|
|
|
|
CV_Assert(get_effective_rank(input1) <= 2); |
|
|
|
|
CV_Assert(get_effective_rank(input2) <= 2); |
|
|
|
|
csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input1, !constTensor.empty(), input2); |
|
|
|
|
csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, transA, input1, transB, input2); |
|
|
|
|
// Bias addition, used for GEMM: biasTensor is non-empty only when a bias Mat was passed to the constructor.
|
|
|
|
|
if (!biasTensor.empty()) |
|
|
|
|
kernels::biasN<T>(stream, output, output, 1, biasTensor); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
CV_Assert(rank >= 3); |
|
|
|
|
input1.reshape(b, m, n); |
|
|
|
|
if (constTensor.empty()) |
|
|
|
|
input2.reshape(b, n, k); |
|
|
|
|
if (transA) |
|
|
|
|
input1.reshape(b1, n1, m1); |
|
|
|
|
else |
|
|
|
|
input1.reshape(b1, m1, n1); |
|
|
|
|
|
|
|
|
|
if (transB) |
|
|
|
|
input2.reshape(b2, n2, m2); |
|
|
|
|
else |
|
|
|
|
input2.reshape(b, k, n); |
|
|
|
|
output.reshape(b, m, k); |
|
|
|
|
input2.reshape(b2, m2, n2); |
|
|
|
|
|
|
|
|
|
output.reshape(b1, m1, n2); |
|
|
|
|
input1.squeeze_to(3); |
|
|
|
|
input2.squeeze_to(3); |
|
|
|
|
output.squeeze_to(3); |
|
|
|
|
csl::tensor_ops::gemmStridedBatched<T>(cublasHandle, 0.0, output, 1.0, false, input1, !constTensor.empty(), input2); |
|
|
|
|
csl::tensor_ops::gemmStridedBatched<T>(cublasHandle, 0.0, output, 1.0, transA, input1, transB, input2); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
private: |
|
|
|
|
csl::Stream stream; |
|
|
|
|
csl::cublas::Handle cublasHandle; |
|
|
|
|
csl::Tensor<T> constTensor; |
|
|
|
|
csl::Tensor<T> constTensor, biasTensor; |
|
|
|
|
bool transA, transB; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
}}} /* namespace cv::dnn::cuda4dnn */ |
|
|
|
|