Merge pull request #24768 from Haosonn:pre-pr-2

Vulkan backend for NaryEltwiseLayer in DNN module #24768

We improve Vulkan backend for ``NaryEltwiseLayer`` in DNN module by:

- add a basic framework for Vulkan backend in ``NaryEltwiseLayer``
- add a compute shader for binary forwarding (an imitation of what has been done in native OpenCV backend including broadcasting and eltwise-operation)
- fixed a typo:
  - Wrong info output in ``context.cpp``

Currently, our implementation (like all layers supporting the Vulkan backend) runs quite slowly on discrete GPUs, mainly due to the I/O cost of the function ``copyToHost``. We plan to fix that by:

- find out the best ``VkMemoryProperty`` for various discrete GPUs

- prevent ``copyToHost`` in middle layers during forwarding (i.e., keep intermediate data in GPU memory)
### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake

Co-authored-by: IskXCr <IskXCr@outlook.com>
pull/24927/head
Haosonn 12 months ago committed by GitHub
parent 03994163b5
commit 87f749277d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 3
      modules/dnn/perf/perf_layer.cpp
  2. 410
      modules/dnn/src/layers/nary_eltwise_layers.cpp
  3. 87
      modules/dnn/src/vkcom/include/op_naryeltwise.hpp
  4. 1
      modules/dnn/src/vkcom/include/vkcom.hpp
  5. 116
      modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
  6. 232
      modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
  7. 5
      modules/dnn/src/vkcom/shader/spv_shader.cpp
  8. 5
      modules/dnn/src/vkcom/shader/spv_shader.hpp
  9. 197
      modules/dnn/src/vkcom/src/op_naryEltwise.cpp

@ -848,6 +848,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
// Backend-specific parameterizations of the NaryEltwise perf/accuracy cases.
#ifdef HAVE_CUDA
INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
#endif
// Run the same NaryEltwise cases on the Vulkan backend when it is built in.
#ifdef HAVE_VULKAN
INSTANTIATE_TEST_CASE_P(VULKAN, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)));
#endif
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));

@ -7,6 +7,7 @@
#include "../op_cuda.hpp"
#include "../op_cann.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@ -34,8 +35,141 @@ static int _mod(int x, int y) {
}
}
// Collects and normalizes shape/step metadata of the input and output Mats so
// that n-ary element-wise operations can run with uniform max_ndims-dimensional
// broadcasting. Index 0 of every per-array container refers to the OUTPUT,
// indices 1..ninputs to the inputs. Shared by the CPU path and initVkCom().
class NaryEltwiseHelper CV_FINAL
{
public:
    int ninputs;     // number of input Mats
    int narrays;     // ninputs + 1 (the output is tracked as array 0)
    int max_ndims;   // maximum dimensionality across all arrays, at least 2
    std::vector<int> all_ndims;                  // original dims count per array
    std::vector<std::vector<int>> orig_shapes;   // original shapes per array
    std::vector<std::vector<size_t>> orig_steps; // original steps (bytes) per array
    std::vector<char*> ptrs;                     // scratch data pointers, one per array
    std::vector<std::vector<int>> shapes;        // broadcast-normalized shapes
    std::vector<std::vector<size_t>> steps;      // broadcast-normalized steps (bytes)

    NaryEltwiseHelper() {
    }

    // Records the shapes and steps of all inputs and the output, and sizes the
    // normalized containers to narrays x max_ndims (filled later by
    // prepare_for_broadcast_op()).
    void helperInit(const std::vector<Mat>& inputs, const std::vector<Mat>& outputs)
    {
        narrays = 0;
        max_ndims = 0;
        all_ndims.clear();
        orig_shapes.clear();
        orig_steps.clear();
        ptrs.clear();
        shapes.clear();
        steps.clear();

        // explicit cast: size() is size_t, ninputs is int (avoid implicit narrowing)
        ninputs = static_cast<int>(inputs.size());
        narrays = ninputs + 1;

        // collect ndims
        std::vector<int> v_inp_dims;
        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
        const int* inp_ndims = v_inp_dims.data();

        int out_ndims = outputs[0].dims;

        // find max ndims for broadcasting; 2 is the floor so the 2D inner loops
        // of the forward kernels always have valid n1/n2 dimensions
        int i;
        max_ndims = out_ndims > 2 ? out_ndims : 2;
        for(i = 0; i < ninputs; i++)
            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];

        shapes = std::vector<std::vector<int>>(narrays, std::vector<int>(max_ndims, 0));
        steps = std::vector<std::vector<size_t>>(narrays, std::vector<size_t>(max_ndims, 0));
        ptrs = std::vector<char*>(narrays, nullptr);

        // i == 0 is the output, i >= 1 are the inputs
        for(i = 0; i <= ninputs; i++) {
            all_ndims.push_back(i == 0 ? out_ndims : inp_ndims[i-1]);
            std::vector<int> _size;
            std::vector<size_t> _step;
            if (!i) {
                std::transform(outputs[0].size.p, outputs[0].size.p + outputs[0].dims, std::back_inserter(_size), [](int s) { return s; });
                std::transform(outputs[0].step.p, outputs[0].step.p + outputs[0].dims, std::back_inserter(_step), [](size_t s) { return s; });
            }
            else {
                std::transform(inputs[i-1].size.p, inputs[i-1].size.p + inputs[i-1].dims, std::back_inserter(_size), [](int s) { return s; });
                std::transform(inputs[i-1].step.p, inputs[i-1].step.p + inputs[i-1].dims, std::back_inserter(_step), [](size_t s) { return s; });
            }
            orig_shapes.push_back(_size);
            orig_steps.push_back(_step);
        }
    }

    // Fills `shapes`/`steps` with max_ndims-dimensional, flattened,
    // broadcast-ready metadata. T determines the element size used for the
    // innermost step. Returns false if any dimension is 0 (empty array).
    // use FP32 as default type in finalized() function
    template <typename T>
    bool prepare_for_broadcast_op()
    {
        int i, j, k;

        std::vector<size_t> elemsize(this->all_ndims.size(), sizeof(T));

        // step 1.
        // * make all inputs and the output max_ndims-dimensional.
        // ** prepend dimension 1 to the mat of less dims
        // * compute proper step's
        for (i = this->max_ndims-1; i >= 0; i--) {
            for (k = 0; k < this->narrays; k++) {
                // j maps normalized axis i back to the array's original axis
                // (negative when axis i is one of the prepended 1-dims)
                j = this->all_ndims[k] - (this->max_ndims - i);
                int sz_i = j >= 0 ? this->orig_shapes[k][j] : 1;
                // fall back to a dense step (elemsize for the last axis,
                // otherwise step*size of the next axis) when none is recorded
                size_t st_i = j >= 0 && this->orig_steps[k][j] > 0 ? this->orig_steps[k][j] :
                    i == this->max_ndims-1 ? elemsize[k] : this->steps[k][i+1]*this->shapes[k][i+1];
                assert(st_i % elemsize[k] == 0);
                this->shapes[k][i] = sz_i;
                this->steps[k][i] = st_i;
                if (this->shapes[k][i] == 0)
                    return false;
            }
        }

        // step 3. Let's do the flattening first,
        // since we'd need proper values of steps to check continuity.
        // this loop is probably the most tricky part
        // in the whole implementation of broadcasting.
        // Axes are merged right-to-left into axis j whenever all arrays are
        // contiguous across the pair and their broadcast pattern is compatible.
        j = this->max_ndims-1;
        for (i = j - 1; i >= 0; i--) {
            bool all_contiguous = true, all_scalars = true, all_consistent = true;
            for(k = 0; k < this->narrays; k++) {
                size_t st = this->steps[k][j]*this->shapes[k][j];
                bool prev_scalar = this->shapes[k][j] == 1;
                bool scalar = this->shapes[k][i] == 1;
                all_contiguous = all_contiguous && (st == this->steps[k][i]);
                all_scalars = all_scalars && scalar;
                all_consistent = all_consistent && (scalar == prev_scalar);
            }
            if (all_contiguous && (all_consistent || all_scalars)) {
                // merge axis i into axis j
                for(k = 0; k < this->narrays; k++)
                    this->shapes[k][j] *= this->shapes[k][i];
            } else {
                // cannot merge: shift the kept axes down to stay compact
                j--;
                if (i < j) {
                    for(k = 0; k < this->narrays; k++) {
                        this->shapes[k][j] = this->shapes[k][i];
                        this->steps[k][j] = this->steps[k][i];
                    }
                }
            }
        }

        // step 2. Set some step's to 0's.
        // A zero step makes a broadcast (size-1) axis re-read the same data.
        for (i = this->max_ndims-1; i >= j; i--) {
            for (k = 0; k < this->narrays; k++)
                this->steps[k][i] = this->shapes[k][i] == 1 ? 0 : this->steps[k][i];
        }
        // axes above the flattened block collapse to trivial 1-dims
        for (; i >= 0; i--) {
            for (k = 0; k < this->narrays; k++) {
                this->steps[k][i] = 0;
                this->shapes[k][i] = 1;
            }
        }
        return true;
    }
};
class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
{
NaryEltwiseHelper helper;
public:
enum class OPERATION
{
@ -130,6 +264,13 @@ public:
op == OPERATION::MOD ||
op == OPERATION::FMOD
);
#ifdef HAVE_VULKAN
if (backendId == DNN_BACKEND_VKCOM)
return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
op == OPERATION::DIV ;
#endif
if (backendId == DNN_BACKEND_CUDA) {
return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD ||
@ -166,72 +307,14 @@ public:
return outShape;
}
static bool prepare_for_broadcast_op(
int narrays, int max_ndims, const size_t* elemsize,
const int* ndims, const int** shape_, const size_t** step_,
int** shape, size_t** step)
{
int i, j, k;
// step 1.
// * make all inputs and the output max_ndims-dimensional.
// ** prepend dimension 1 to the mat of less dims
// * compute proper step's
for (i = max_ndims-1; i >= 0; i-- ) {
for (k = 0; k < narrays; k++) {
j = ndims[k] - (max_ndims - i);
int sz_i = j >= 0 ? shape_[k][j] : 1;
size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] :
i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1];
assert(st_i % elemsize[k] == 0);
shape[k][i] = sz_i;
step[k][i] = st_i;
if (shape[k][i] == 0)
return false;
}
}
// step 3. Let's do the flattening first,
// since we'd need proper values of steps to check continuity.
// this loop is probably the most tricky part
// in the whole implementation of broadcasting.
j = max_ndims-1;
for (i = j - 1; i >= 0; i--) {
bool all_contiguous = true, all_scalars = true, all_consistent = true;
for(k = 0; k < narrays; k++) {
size_t st = step[k][j]*shape[k][j];
bool prev_scalar = shape[k][j] == 1;
bool scalar = shape[k][i] == 1;
all_contiguous = all_contiguous && (st == step[k][i]);
all_scalars = all_scalars && scalar;
all_consistent = all_consistent && (scalar == prev_scalar);
}
if (all_contiguous && (all_consistent || all_scalars)) {
for(k = 0; k < narrays; k++)
shape[k][j] *= shape[k][i];
} else {
j--;
if (i < j) {
for(k = 0; k < narrays; k++) {
shape[k][j] = shape[k][i];
step[k][j] = step[k][i];
}
}
}
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
// step 2. Set some step's to 0's.
for (i = max_ndims-1; i >= j; i--) {
for (k = 0; k < narrays; k++)
step[k][i] = shape[k][i] == 1 ? 0 : step[k][i];
}
for (; i >= 0; i--) {
for (k = 0; k < narrays; k++) {
step[k][i] = 0;
shape[k][i] = 1;
}
}
return true;
helper.helperInit(inputs, outputs);
CV_Assert(helper.prepare_for_broadcast_op<float>());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -246,10 +329,10 @@ public:
template <typename T, typename Functor>
void binary_forward_impl(
int ndims, const int* shape,
const char* data1, const size_t* step1,
const char* data2, const size_t* step2,
char* data, const size_t* step,
int ndims, const std::vector<int>& shape,
const char* data1, const std::vector<size_t>& step1,
const char* data2, const std::vector<size_t>& step2,
char* data, const std::vector<size_t>& step,
const Functor& op)
{
assert(ndims >= 2);
@ -305,63 +388,18 @@ public:
const Mat& a = inputs[0];
const Mat& b = inputs[1];
Mat& out = outputs[0];
// collect info of inputs and output
const int* in_shape[] = {a.size.p, b.size.p};
const size_t* in_step[] = {a.step.p, b.step.p};
const int* out_shape = out.size.p;
const size_t* out_step = out.step.p;
const int in_ndims[] = {a.dims, b.dims};
int out_ndims = out.dims;
int max_ndims = std::max(a.dims, std::max(b.dims, out.dims));
// buf holds the folllowing for a, b & output:
// * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), 3*4 elements in total
// * shape_buf & step_buf, 3*2*max_ndims elements in total
// * all_ndims, 3*1 elements in total
// * all_type_sizes, 3*1 elements in total
AutoBuffer<size_t> buf(3 * (2 * max_ndims + 6));
int** orig_shapes = (int**)(buf.data());
int** shapes = orig_shapes + 3;
size_t** orig_steps = (size_t**)(shapes + 3);
size_t** steps = orig_steps + 3;
int* shape_buf = (int*)(steps + 3);
size_t* step_buf = (size_t*)(shape_buf + 3 * max_ndims);
int* all_ndims = (int*)(step_buf + 3 * max_ndims);
size_t* all_type_sizes = (size_t*)(all_ndims + 3);
// assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
for (int i = 0; i < 3; i++)
{
orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
shapes[i] = shape_buf + i * max_ndims;
steps[i] = step_buf + i * max_ndims;
all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
all_type_sizes[i] = sizeof(T);
}
if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;
CV_Assert(helper.shapes.size() == 3 && helper.steps.size() == 3);
binary_forward_impl<T, Functor>(
max_ndims, shapes[0], a.ptr<char>(), steps[1],
b.ptr<char>(), steps[2], out.ptr<char>(), steps[0],
helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1],
b.ptr<char>(), helper.steps[2], out.ptr<char>(), helper.steps[0],
f);
}
template<typename T, typename Functor>
void nary_forward_impl(
const Functor& f, const T scale, int ninputs, int ndims, const int* shape,
const Functor& f, const T scale, int ninputs, int ndims, const std::vector<int>& shape,
const char** inp, char* out,
const size_t** steps, char** ptrs)
const std::vector<std::vector<size_t>>& steps, std::vector<char*>& ptrs)
{
CV_Assert(ndims >= 2);
size_t dp = steps[0][ndims-1]/sizeof(T);
@ -446,77 +484,16 @@ public:
const std::vector<Mat>& inputs, std::vector<Mat>& outputs
)
{
int ninputs = inputs.size();
// collect all input
// collect all input info
std::vector<const char*> v_inp;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr<const char>(); });
const char** inp = v_inp.data();
// collect ndims of all input
std::vector<int> v_inp_dims;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
const int* inp_ndims = v_inp_dims.data();
// collect shapes of all input
std::vector<const int*> v_inp_shape;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; });
const int** inp_shape = v_inp_shape.data();
// collect steps of all input
std::vector<const size_t*> v_inp_step;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; });
const size_t** inp_step = v_inp_step.data();
// collect info of output (ndims, shape, step)
// collect output info
char* out = outputs[0].ptr<char>();
int out_ndims = outputs[0].dims;
const int* out_shape = outputs[0].size.p;
const size_t* out_step = outputs[0].step.p;
// find max ndims for broadcasting
int i, max_ndims = out_ndims > 2 ? out_ndims : 2;
for(i = 0; i < ninputs; i++)
max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
// buf holds the following buffers for inputs & output:
// * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total
// * ptrs, (ninputs+1)*1 elements in total
// * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total
// * all_ndims, (ninputs+1)*1 elements in total
// * all_type_sizes, (ninputs+1)*1 elements in total
AutoBuffer<size_t> buf((ninputs + 1) * (2 * max_ndims + 7));
int** orig_shapes = (int**)buf.data();
int** shapes = orig_shapes + ninputs + 1;
size_t** orig_steps = (size_t**)(shapes + ninputs + 1);
size_t** steps = orig_steps + ninputs + 1;
char** ptrs = (char**)(steps + ninputs + 1);
size_t* step_buf = (size_t*)(ptrs + ninputs + 1);
int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims);
int* all_ndims = shape_buf + (ninputs + 1)*max_ndims;
size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1);
for(i = 0; i <= ninputs; i++) {
all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1];
all_type_sizes[i] = sizeof(T);
orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0);
orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? inp_step[i-1] : 0);
shapes[i] = shape_buf + max_ndims*i;
steps[i] = step_buf + max_ndims*i;
}
if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;
nary_forward_impl<T>(
f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs);
f, scale, helper.ninputs, helper.max_ndims, helper.shapes[0], inp, out, helper.steps, helper.ptrs);
}
template <typename T, typename Functor>
@ -527,59 +504,21 @@ public:
const Mat& c = inputs[2];
Mat& out = outputs[0];
// collect info of inputs and output
const int* in_shape[] = {a.size.p, b.size.p, c.size.p};
const size_t* in_step[] = {a.step.p, b.step.p, c.step.p};
const int* out_shape = out.size.p;
const size_t* out_step = out.step.p;
const int in_ndims[] = {a.dims, b.dims, c.dims};
int out_ndims = out.dims;
int max_ndims = std::max(a.dims, std::max(b.dims, std::max(c.dims, out.dims)));
AutoBuffer<size_t> buf(4 * (2 * max_ndims + 6));
int** orig_shapes = (int**)(buf.data());
int** shapes = orig_shapes + 4;
size_t** orig_steps = (size_t**)(shapes + 4);
size_t** steps = orig_steps + 4;
int* shape_buf = (int*)(steps + 4);
size_t* step_buf = (size_t*)(shape_buf + 4 * max_ndims);
int* all_ndims = (int*)(step_buf + 4 * max_ndims);
size_t* all_type_sizes = (size_t*)(all_ndims + 4);
// assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
for (int i = 0; i < 4; i++)
{
orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
shapes[i] = shape_buf + i * max_ndims;
steps[i] = step_buf + i * max_ndims;
all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
all_type_sizes[i] = sizeof(T);
}
if (!prepare_for_broadcast_op(4, max_ndims, all_type_sizes,
all_ndims, (const int**)orig_shapes,
(const size_t**)orig_steps,
shapes, steps))
return;
CV_Assert(helper.shapes.size() == 4 && helper.steps.size() == 4);
trinary_forward_impl<T, Functor>(
max_ndims, shapes[0], a.ptr<char>(), steps[1], b.ptr<char>(), steps[2],
c.ptr<char>(), steps[3], out.ptr<char>(), steps[0],
helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1], b.ptr<char>(), helper.steps[2],
c.ptr<char>(), helper.steps[3], out.ptr<char>(), helper.steps[0],
f);
}
template <typename T, typename Functor>
void trinary_forward_impl(
int ndims, const int* shape,
const char* data1, const size_t* step1,
const char* data2, const size_t* step2,
const char* data3, const size_t* step3,
char* data, const size_t* step,
int ndims, const std::vector<int>& shape,
const char* data1, const std::vector<size_t>& step1,
const char* data2, const std::vector<size_t>& step2,
const char* data3, const std::vector<size_t>& step3,
char* data, const std::vector<size_t>& step,
const Functor& op)
{
assert(ndims >= 2);
@ -795,6 +734,11 @@ public:
{
case CV_8U:
opDispatch<uint8_t>(std::forward<Args>(args)...);
helper.prepare_for_broadcast_op<uint8_t>();
/*
recompute broadcasted shapes
because default type is FP32 which is calculated in finalize() function
*/
break;
case CV_32S:
opDispatch<int32_t>(std::forward<Args>(args)...);
@ -954,6 +898,16 @@ public:
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif
#ifdef HAVE_VULKAN
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE
{
Ptr<vkcom::OpBase> op = makePtr<vkcom::OpNary>((vkcom::OpNary::OPERATION) this->op, helper.ninputs, helper.max_ndims, helper.shapes, helper.steps);
return Ptr<BackendNode>(makePtr<VkComBackendNode>(inputs, op, outputs));
}
#endif
};
Ptr<NaryEltwiseLayer> NaryEltwiseLayer::create(const LayerParams& params)

@ -0,0 +1,87 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_OP_NARY_HPP
#define OPENCV_OP_NARY_HPP

#include "vkcom.hpp"
#include "op_base.hpp"

namespace cv { namespace dnn { namespace vkcom {

#ifdef HAVE_VULKAN

// Which compute-shader variant OpNary dispatches.
enum NaryShaderType
{
    kNaryShaderTypeBinary,   // two inputs, one output
    kNaryShaderTypeTrinary,  // three inputs, one output
    kNaryShaderTypeNary,     // arbitrary number of inputs
    kNaryShaderTest,
};

struct NaryShaderConfig
{
    // Work-group (local) sizes used when dispatching the shader.
    int local_size_x;
    int local_size_y;
    int local_size_z;
};


class OpNary : public OpBase
{
public:
    // Copied from nary_eltwise_layers.cpp. The enumerator values MUST match the
    // layer's OPERATION enum (initVkCom() casts it directly into this type) and
    // the integer op codes hard-coded in nary_eltwise_binary_forward.comp
    // (MOD=13, FMOD=14, PROD=15, ..., DIV=19, WHERE=20).
    enum class OPERATION
    {
        AND = 0,
        EQUAL,
        GREATER,
        GREATER_EQUAL,
        LESS,
        LESS_EQUAL,
        OR,
        POW,
        XOR,
        BITSHIFT,
        MAX,
        MEAN,
        MIN,
        MOD,
        FMOD, // was missing: without it every value from PROD onward is off by
              // one w.r.t. the layer enum and the shader's op-code constants
        PROD,
        SUB,
        SUM,
        ADD,
        DIV,
        WHERE,
    };

    OpNary(const OPERATION naryOpType, int ninputs, int max_ndims, const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps);

    void firstForward(); // Execute only in the first forward.
    virtual bool forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs) CV_OVERRIDE;

    Ptr<Tensor> weightTensorPtr;

private:
    bool computeGroupCount();
    bool binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
    bool trinaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
    bool naryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);

    const OPERATION naryOpType;    // element-wise operation to perform
    NaryShaderType shaderType;     // which shader variant to dispatch
    NaryShaderConfig config;       // local work-group sizes
    int ninputs;                   // number of input tensors
    int max_ndims;                 // dimensionality after broadcast normalization
    AutoBuffer<int32_t> shapesBuf; // flattened shapes (output + inputs)
    AutoBuffer<int32_t> stepsBuf;  // flattened steps (output + inputs)
    int nplanes; // number of planes computations are to be performed on
    int N2; // value of shape[ndims - 2]
    int N1; // value of shape[ndims - 1]
    // NOTE(review): "Finsh" typo kept — the member is referenced from
    // op_naryEltwise.cpp, renaming would have to touch that file too.
    bool firstForwardFinsh = false;
};

#endif // HAVE_VULKAN

}}} // namespace cv::dnn::vkcom

#endif // OPENCV_OP_NARY_HPP

@ -51,5 +51,6 @@ bool isAvailable();
#include "op_base.hpp"
#include "op_conv.hpp"
#include "op_matmul.hpp"
#include "op_naryeltwise.hpp"
#endif // OPENCV_DNN_VKCOM_HPP

@ -0,0 +1,116 @@
#version 450
// Binary element-wise forward pass (ADD/SUB/PROD/DIV) with broadcasting.
// One work group processes one (plane, row-block) pair; the x-local threads
// stride over the innermost dimension.
// #extension GL_EXT_debug_printf : enable
#define ALL_THREAD 1024
// #define ALL_THREAD 128 // Experimental batched operation
#define STEP_SIZE 65536
layout(binding = 0) readonly buffer Input1{
    float matA[];
};
layout(binding = 1) readonly buffer Input2{
    float matB[];
};
layout(binding = 2) writeonly buffer Output{
    float matOut[];
};
layout(binding = 3) uniform Params {
    int opType; // one of the op-code constants below
    int ndims;  // dimensionality shared by all three arrays after broadcasting
} params;
// Broadcast-normalized output shape, shape[0..ndims-1].
layout(binding = 4) readonly buffer Shape {
    int shape[];
};
// Element (not byte) steps packed per array:
// matStep[0..ndims-1] = output, [ndims..2*ndims-1] = A, [2*ndims..3*ndims-1] = B.
layout(binding = 5) readonly buffer Step {
    int matStep[];
};
/* local_size_x, local_size_y, local_size_z there defines the number of invocations
   of this compute shader in the current work group. */
// TODO: Check if this makes any sense
// TODO: Check if it is required to fetch PhysicalDeviceLimit from Context
// TODO: here we shall assume that maxGroupInvocation is 1024.
layout(local_size_x = ALL_THREAD, local_size_y = 1, local_size_z = 1) in; // TODO: Check if this makes any sense
// Op codes; must match the OPERATION enum in nary_eltwise_layers.cpp.
const int AND = 0;
const int EQUAL = 1;
const int GREATER = 2;
const int GREATER_EQUAL = 3;
const int LESS = 4;
const int LESS_EQUAL = 5;
const int OR = 6;
const int POW = 7;
const int XOR = 8;
const int BITSHIFT = 9;
const int MAX = 10;
const int MEAN = 11;
const int MIN = 12;
const int MOD = 13;
const int FMOD = 14;
const int PROD = 15;
const int SUB = 16;
const int SUM = 17;
const int ADD = 18;
const int DIV = 19;
const int WHERE = 20;
void binary_forward()
{
    int ndims = params.ndims;
    // innermost-axis steps of A, B and the output
    int dp1 = matStep[2 * ndims - 1];
    int dp2 = matStep[3 * ndims - 1];
    int dp = matStep[ndims - 1];
    int n1 = shape[ndims - 1], n2 = shape[ndims - 2];
    // each work group along x handles one "plane" (all axes above the last two)
    int plane_idx = int(gl_WorkGroupID.x);
    int ptr1 = 0;
    int ptr2 = 0;
    int ptr = 0;
    // decompose plane_idx into per-axis indices and accumulate base offsets;
    // broadcast axes have step 0, so they contribute nothing
    int idx = plane_idx;
    for (int k = ndims - 3; k >= 0; --k) {
        int next_idx = idx / shape[k];
        int i_k = idx - next_idx * shape[k]; // i_k = idx % shape[k]
        ptr1 += i_k * matStep[ndims + k];
        ptr2 += i_k * matStep[2 * ndims + k];
        ptr += i_k * matStep[k];
        idx = next_idx;
    }
    // y-dimension of the dispatch selects the row (axis ndims-2)
    int i2_offset = int(gl_WorkGroupID.y);
    int i1_offset = int(gl_LocalInvocationID.x);
    ptr1 += i2_offset * matStep[2 * ndims - 2];
    ptr2 += i2_offset * matStep[3 * ndims - 2];
    ptr += i2_offset * matStep[ndims - 2];
    // threads cooperatively stride across the innermost axis
    for (int i1 = i1_offset; i1 < n1; i1 += ALL_THREAD) {
        switch (params.opType) {
            case int(ADD):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] + matB[ptr2 + i1 * dp2];
                break;
            case int(SUB):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] - matB[ptr2 + i1 * dp2];
                break;
            case int(PROD):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] * matB[ptr2 + i1 * dp2];
                break;
            case int(DIV):
                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] / matB[ptr2 + i1 * dp2];
                break;
            // other op codes are rejected by supportBackend(), so no default
        }
    }
}
void main()
{
    // debugPrintfEXT("nary_eltwise_binary_forward.comp loaded\n");
    binary_forward();
    return;
}

@ -0,0 +1,232 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../../precomp.hpp"
namespace cv { namespace dnn { namespace vkcom {
// Machine-generated SPIR-V binary (1757 32-bit words) compiled from
// shader/nary_eltwise_binary_forward.comp. Do not edit by hand — regenerate
// from the GLSL source instead. The array length must match the size
// registered for "nary_eltwise_binary_forward_spv" in initSPVMaps().
extern const unsigned int nary_eltwise_binary_forward_spv[1757] = {
0x07230203,0x00010000,0x0008000b,0x00000131,0x00000000,0x00020011,0x00000001,0x0006000b,
0x00000001,0x4c534c47,0x6474732e,0x3035342e,0x00000000,0x0003000e,0x00000000,0x00000001,
0x0007000f,0x00000005,0x00000004,0x6e69616d,0x00000000,0x0000003c,0x00000083,0x00060010,
0x00000004,0x00000011,0x00000400,0x00000001,0x00000001,0x00030003,0x00000002,0x000001c2,
0x00040005,0x00000004,0x6e69616d,0x00000000,0x00060005,0x00000006,0x616e6962,0x665f7972,
0x6177726f,0x00286472,0x00040005,0x0000000a,0x6d69646e,0x00000073,0x00040005,0x0000000b,
0x61726150,0x0000736d,0x00050006,0x0000000b,0x00000000,0x7954706f,0x00006570,0x00050006,
0x0000000b,0x00000001,0x6d69646e,0x00000073,0x00040005,0x0000000d,0x61726170,0x0000736d,
0x00030005,0x00000012,0x00317064,0x00040005,0x00000014,0x70657453,0x00000000,0x00050006,
0x00000014,0x00000000,0x5374616d,0x00706574,0x00030005,0x00000016,0x00000000,0x00030005,
0x0000001e,0x00327064,0x00030005,0x00000025,0x00007064,0x00030005,0x0000002a,0x0000316e,
0x00040005,0x0000002c,0x70616853,0x00000065,0x00050006,0x0000002c,0x00000000,0x70616873,
0x00000065,0x00030005,0x0000002e,0x00000000,0x00030005,0x00000033,0x0000326e,0x00050005,
0x00000038,0x6e616c70,0x64695f65,0x00000078,0x00060005,0x0000003c,0x575f6c67,0x476b726f,
0x70756f72,0x00004449,0x00040005,0x00000042,0x31727470,0x00000000,0x00040005,0x00000043,
0x32727470,0x00000000,0x00030005,0x00000044,0x00727470,0x00030005,0x00000045,0x00786469,
0x00030005,0x00000047,0x0000006b,0x00050005,0x00000052,0x7478656e,0x7864695f,0x00000000,
0x00030005,0x00000058,0x006b5f69,0x00050005,0x0000007d,0x6f5f3269,0x65736666,0x00000074,
0x00050005,0x00000082,0x6f5f3169,0x65736666,0x00000074,0x00080005,0x00000083,0x4c5f6c67,
0x6c61636f,0x6f766e49,0x69746163,0x44496e6f,0x00000000,0x00030005,0x000000a1,0x00003169,
0x00040005,0x000000b4,0x7074754f,0x00007475,0x00050006,0x000000b4,0x00000000,0x4f74616d,
0x00007475,0x00030005,0x000000b6,0x00000000,0x00040005,0x000000bd,0x75706e49,0x00003174,
0x00050006,0x000000bd,0x00000000,0x4174616d,0x00000000,0x00030005,0x000000bf,0x00000000,
0x00040005,0x000000c9,0x75706e49,0x00003274,0x00050006,0x000000c9,0x00000000,0x4274616d,
0x00000000,0x00030005,0x000000cb,0x00000000,0x00050048,0x0000000b,0x00000000,0x00000023,
0x00000000,0x00050048,0x0000000b,0x00000001,0x00000023,0x00000004,0x00030047,0x0000000b,
0x00000002,0x00040047,0x0000000d,0x00000022,0x00000000,0x00040047,0x0000000d,0x00000021,
0x00000003,0x00040047,0x00000013,0x00000006,0x00000004,0x00040048,0x00000014,0x00000000,
0x00000018,0x00050048,0x00000014,0x00000000,0x00000023,0x00000000,0x00030047,0x00000014,
0x00000003,0x00040047,0x00000016,0x00000022,0x00000000,0x00040047,0x00000016,0x00000021,
0x00000005,0x00040047,0x0000002b,0x00000006,0x00000004,0x00040048,0x0000002c,0x00000000,
0x00000018,0x00050048,0x0000002c,0x00000000,0x00000023,0x00000000,0x00030047,0x0000002c,
0x00000003,0x00040047,0x0000002e,0x00000022,0x00000000,0x00040047,0x0000002e,0x00000021,
0x00000004,0x00040047,0x0000003c,0x0000000b,0x0000001a,0x00040047,0x00000083,0x0000000b,
0x0000001b,0x00040047,0x000000b3,0x00000006,0x00000004,0x00040048,0x000000b4,0x00000000,
0x00000019,0x00050048,0x000000b4,0x00000000,0x00000023,0x00000000,0x00030047,0x000000b4,
0x00000003,0x00040047,0x000000b6,0x00000022,0x00000000,0x00040047,0x000000b6,0x00000021,
0x00000002,0x00040047,0x000000bc,0x00000006,0x00000004,0x00040048,0x000000bd,0x00000000,
0x00000018,0x00050048,0x000000bd,0x00000000,0x00000023,0x00000000,0x00030047,0x000000bd,
0x00000003,0x00040047,0x000000bf,0x00000022,0x00000000,0x00040047,0x000000bf,0x00000021,
0x00000000,0x00040047,0x000000c8,0x00000006,0x00000004,0x00040048,0x000000c9,0x00000000,
0x00000018,0x00050048,0x000000c9,0x00000000,0x00000023,0x00000000,0x00030047,0x000000c9,
0x00000003,0x00040047,0x000000cb,0x00000022,0x00000000,0x00040047,0x000000cb,0x00000021,
0x00000001,0x00040047,0x0000011f,0x0000000b,0x00000019,0x00020013,0x00000002,0x00030021,
0x00000003,0x00000002,0x00040015,0x00000008,0x00000020,0x00000001,0x00040020,0x00000009,
0x00000007,0x00000008,0x0004001e,0x0000000b,0x00000008,0x00000008,0x00040020,0x0000000c,
0x00000002,0x0000000b,0x0004003b,0x0000000c,0x0000000d,0x00000002,0x0004002b,0x00000008,
0x0000000e,0x00000001,0x00040020,0x0000000f,0x00000002,0x00000008,0x0003001d,0x00000013,
0x00000008,0x0003001e,0x00000014,0x00000013,0x00040020,0x00000015,0x00000002,0x00000014,
0x0004003b,0x00000015,0x00000016,0x00000002,0x0004002b,0x00000008,0x00000017,0x00000000,
0x0004002b,0x00000008,0x00000018,0x00000002,0x0004002b,0x00000008,0x0000001f,0x00000003,
0x0003001d,0x0000002b,0x00000008,0x0003001e,0x0000002c,0x0000002b,0x00040020,0x0000002d,
0x00000002,0x0000002c,0x0004003b,0x0000002d,0x0000002e,0x00000002,0x00040015,0x00000039,
0x00000020,0x00000000,0x00040017,0x0000003a,0x00000039,0x00000003,0x00040020,0x0000003b,
0x00000001,0x0000003a,0x0004003b,0x0000003b,0x0000003c,0x00000001,0x0004002b,0x00000039,
0x0000003d,0x00000000,0x00040020,0x0000003e,0x00000001,0x00000039,0x00020014,0x00000050,
0x0004002b,0x00000039,0x0000007e,0x00000001,0x0004003b,0x0000003b,0x00000083,0x00000001,
0x00030016,0x000000b2,0x00000020,0x0003001d,0x000000b3,0x000000b2,0x0003001e,0x000000b4,
0x000000b3,0x00040020,0x000000b5,0x00000002,0x000000b4,0x0004003b,0x000000b5,0x000000b6,
0x00000002,0x0003001d,0x000000bc,0x000000b2,0x0003001e,0x000000bd,0x000000bc,0x00040020,
0x000000be,0x00000002,0x000000bd,0x0004003b,0x000000be,0x000000bf,0x00000002,0x00040020,
0x000000c5,0x00000002,0x000000b2,0x0003001d,0x000000c8,0x000000b2,0x0003001e,0x000000c9,
0x000000c8,0x00040020,0x000000ca,0x00000002,0x000000c9,0x0004003b,0x000000ca,0x000000cb,
0x00000002,0x0004002b,0x00000008,0x00000119,0x00000400,0x0004002b,0x00000039,0x0000011e,
0x00000400,0x0006002c,0x0000003a,0x0000011f,0x0000011e,0x0000007e,0x0000007e,0x0004002b,
0x00000008,0x00000120,0x00000004,0x0004002b,0x00000008,0x00000121,0x00000005,0x0004002b,
0x00000008,0x00000122,0x00000006,0x0004002b,0x00000008,0x00000123,0x00000007,0x0004002b,
0x00000008,0x00000124,0x00000008,0x0004002b,0x00000008,0x00000125,0x00000009,0x0004002b,
0x00000008,0x00000126,0x0000000a,0x0004002b,0x00000008,0x00000127,0x0000000b,0x0004002b,
0x00000008,0x00000128,0x0000000c,0x0004002b,0x00000008,0x00000129,0x0000000d,0x0004002b,
0x00000008,0x0000012a,0x0000000e,0x0004002b,0x00000008,0x0000012b,0x0000000f,0x0004002b,
0x00000008,0x0000012c,0x00000010,0x0004002b,0x00000008,0x0000012d,0x00000011,0x0004002b,
0x00000008,0x0000012e,0x00000012,0x0004002b,0x00000008,0x0000012f,0x00000013,0x0004002b,
0x00000008,0x00000130,0x00000014,0x00050036,0x00000002,0x00000004,0x00000000,0x00000003,
0x000200f8,0x00000005,0x00040039,0x00000002,0x0000011c,0x00000006,0x000100fd,0x00010038,
0x00050036,0x00000002,0x00000006,0x00000000,0x00000003,0x000200f8,0x00000007,0x0004003b,
0x00000009,0x0000000a,0x00000007,0x0004003b,0x00000009,0x00000012,0x00000007,0x0004003b,
0x00000009,0x0000001e,0x00000007,0x0004003b,0x00000009,0x00000025,0x00000007,0x0004003b,
0x00000009,0x0000002a,0x00000007,0x0004003b,0x00000009,0x00000033,0x00000007,0x0004003b,
0x00000009,0x00000038,0x00000007,0x0004003b,0x00000009,0x00000042,0x00000007,0x0004003b,
0x00000009,0x00000043,0x00000007,0x0004003b,0x00000009,0x00000044,0x00000007,0x0004003b,
0x00000009,0x00000045,0x00000007,0x0004003b,0x00000009,0x00000047,0x00000007,0x0004003b,
0x00000009,0x00000052,0x00000007,0x0004003b,0x00000009,0x00000058,0x00000007,0x0004003b,
0x00000009,0x0000007d,0x00000007,0x0004003b,0x00000009,0x00000082,0x00000007,0x0004003b,
0x00000009,0x000000a1,0x00000007,0x00050041,0x0000000f,0x00000010,0x0000000d,0x0000000e,
0x0004003d,0x00000008,0x00000011,0x00000010,0x0003003e,0x0000000a,0x00000011,0x0004003d,
0x00000008,0x00000019,0x0000000a,0x00050084,0x00000008,0x0000001a,0x00000018,0x00000019,
0x00050082,0x00000008,0x0000001b,0x0000001a,0x0000000e,0x00060041,0x0000000f,0x0000001c,
0x00000016,0x00000017,0x0000001b,0x0004003d,0x00000008,0x0000001d,0x0000001c,0x0003003e,
0x00000012,0x0000001d,0x0004003d,0x00000008,0x00000020,0x0000000a,0x00050084,0x00000008,
0x00000021,0x0000001f,0x00000020,0x00050082,0x00000008,0x00000022,0x00000021,0x0000000e,
0x00060041,0x0000000f,0x00000023,0x00000016,0x00000017,0x00000022,0x0004003d,0x00000008,
0x00000024,0x00000023,0x0003003e,0x0000001e,0x00000024,0x0004003d,0x00000008,0x00000026,
0x0000000a,0x00050082,0x00000008,0x00000027,0x00000026,0x0000000e,0x00060041,0x0000000f,
0x00000028,0x00000016,0x00000017,0x00000027,0x0004003d,0x00000008,0x00000029,0x00000028,
0x0003003e,0x00000025,0x00000029,0x0004003d,0x00000008,0x0000002f,0x0000000a,0x00050082,
0x00000008,0x00000030,0x0000002f,0x0000000e,0x00060041,0x0000000f,0x00000031,0x0000002e,
0x00000017,0x00000030,0x0004003d,0x00000008,0x00000032,0x00000031,0x0003003e,0x0000002a,
0x00000032,0x0004003d,0x00000008,0x00000034,0x0000000a,0x00050082,0x00000008,0x00000035,
0x00000034,0x00000018,0x00060041,0x0000000f,0x00000036,0x0000002e,0x00000017,0x00000035,
0x0004003d,0x00000008,0x00000037,0x00000036,0x0003003e,0x00000033,0x00000037,0x00050041,
0x0000003e,0x0000003f,0x0000003c,0x0000003d,0x0004003d,0x00000039,0x00000040,0x0000003f,
0x0004007c,0x00000008,0x00000041,0x00000040,0x0003003e,0x00000038,0x00000041,0x0003003e,
0x00000042,0x00000017,0x0003003e,0x00000043,0x00000017,0x0003003e,0x00000044,0x00000017,
0x0004003d,0x00000008,0x00000046,0x00000038,0x0003003e,0x00000045,0x00000046,0x0004003d,
0x00000008,0x00000048,0x0000000a,0x00050082,0x00000008,0x00000049,0x00000048,0x0000001f,
0x0003003e,0x00000047,0x00000049,0x000200f9,0x0000004a,0x000200f8,0x0000004a,0x000400f6,
0x0000004c,0x0000004d,0x00000000,0x000200f9,0x0000004e,0x000200f8,0x0000004e,0x0004003d,
0x00000008,0x0000004f,0x00000047,0x000500af,0x00000050,0x00000051,0x0000004f,0x00000017,
0x000400fa,0x00000051,0x0000004b,0x0000004c,0x000200f8,0x0000004b,0x0004003d,0x00000008,
0x00000053,0x00000045,0x0004003d,0x00000008,0x00000054,0x00000047,0x00060041,0x0000000f,
0x00000055,0x0000002e,0x00000017,0x00000054,0x0004003d,0x00000008,0x00000056,0x00000055,
0x00050087,0x00000008,0x00000057,0x00000053,0x00000056,0x0003003e,0x00000052,0x00000057,
0x0004003d,0x00000008,0x00000059,0x00000045,0x0004003d,0x00000008,0x0000005a,0x00000052,
0x0004003d,0x00000008,0x0000005b,0x00000047,0x00060041,0x0000000f,0x0000005c,0x0000002e,
0x00000017,0x0000005b,0x0004003d,0x00000008,0x0000005d,0x0000005c,0x00050084,0x00000008,
0x0000005e,0x0000005a,0x0000005d,0x00050082,0x00000008,0x0000005f,0x00000059,0x0000005e,
0x0003003e,0x00000058,0x0000005f,0x0004003d,0x00000008,0x00000060,0x00000058,0x0004003d,
0x00000008,0x00000061,0x0000000a,0x0004003d,0x00000008,0x00000062,0x00000047,0x00050080,
0x00000008,0x00000063,0x00000061,0x00000062,0x00060041,0x0000000f,0x00000064,0x00000016,
0x00000017,0x00000063,0x0004003d,0x00000008,0x00000065,0x00000064,0x00050084,0x00000008,
0x00000066,0x00000060,0x00000065,0x0004003d,0x00000008,0x00000067,0x00000042,0x00050080,
0x00000008,0x00000068,0x00000067,0x00000066,0x0003003e,0x00000042,0x00000068,0x0004003d,
0x00000008,0x00000069,0x00000058,0x0004003d,0x00000008,0x0000006a,0x0000000a,0x00050084,
0x00000008,0x0000006b,0x00000018,0x0000006a,0x0004003d,0x00000008,0x0000006c,0x00000047,
0x00050080,0x00000008,0x0000006d,0x0000006b,0x0000006c,0x00060041,0x0000000f,0x0000006e,
0x00000016,0x00000017,0x0000006d,0x0004003d,0x00000008,0x0000006f,0x0000006e,0x00050084,
0x00000008,0x00000070,0x00000069,0x0000006f,0x0004003d,0x00000008,0x00000071,0x00000043,
0x00050080,0x00000008,0x00000072,0x00000071,0x00000070,0x0003003e,0x00000043,0x00000072,
0x0004003d,0x00000008,0x00000073,0x00000058,0x0004003d,0x00000008,0x00000074,0x00000047,
0x00060041,0x0000000f,0x00000075,0x00000016,0x00000017,0x00000074,0x0004003d,0x00000008,
0x00000076,0x00000075,0x00050084,0x00000008,0x00000077,0x00000073,0x00000076,0x0004003d,
0x00000008,0x00000078,0x00000044,0x00050080,0x00000008,0x00000079,0x00000078,0x00000077,
0x0003003e,0x00000044,0x00000079,0x0004003d,0x00000008,0x0000007a,0x00000052,0x0003003e,
0x00000045,0x0000007a,0x000200f9,0x0000004d,0x000200f8,0x0000004d,0x0004003d,0x00000008,
0x0000007b,0x00000047,0x00050082,0x00000008,0x0000007c,0x0000007b,0x0000000e,0x0003003e,
0x00000047,0x0000007c,0x000200f9,0x0000004a,0x000200f8,0x0000004c,0x00050041,0x0000003e,
0x0000007f,0x0000003c,0x0000007e,0x0004003d,0x00000039,0x00000080,0x0000007f,0x0004007c,
0x00000008,0x00000081,0x00000080,0x0003003e,0x0000007d,0x00000081,0x00050041,0x0000003e,
0x00000084,0x00000083,0x0000003d,0x0004003d,0x00000039,0x00000085,0x00000084,0x0004007c,
0x00000008,0x00000086,0x00000085,0x0003003e,0x00000082,0x00000086,0x0004003d,0x00000008,
0x00000087,0x0000007d,0x0004003d,0x00000008,0x00000088,0x0000000a,0x00050084,0x00000008,
0x00000089,0x00000018,0x00000088,0x00050082,0x00000008,0x0000008a,0x00000089,0x00000018,
0x00060041,0x0000000f,0x0000008b,0x00000016,0x00000017,0x0000008a,0x0004003d,0x00000008,
0x0000008c,0x0000008b,0x00050084,0x00000008,0x0000008d,0x00000087,0x0000008c,0x0004003d,
0x00000008,0x0000008e,0x00000042,0x00050080,0x00000008,0x0000008f,0x0000008e,0x0000008d,
0x0003003e,0x00000042,0x0000008f,0x0004003d,0x00000008,0x00000090,0x0000007d,0x0004003d,
0x00000008,0x00000091,0x0000000a,0x00050084,0x00000008,0x00000092,0x0000001f,0x00000091,
0x00050082,0x00000008,0x00000093,0x00000092,0x00000018,0x00060041,0x0000000f,0x00000094,
0x00000016,0x00000017,0x00000093,0x0004003d,0x00000008,0x00000095,0x00000094,0x00050084,
0x00000008,0x00000096,0x00000090,0x00000095,0x0004003d,0x00000008,0x00000097,0x00000043,
0x00050080,0x00000008,0x00000098,0x00000097,0x00000096,0x0003003e,0x00000043,0x00000098,
0x0004003d,0x00000008,0x00000099,0x0000007d,0x0004003d,0x00000008,0x0000009a,0x0000000a,
0x00050082,0x00000008,0x0000009b,0x0000009a,0x00000018,0x00060041,0x0000000f,0x0000009c,
0x00000016,0x00000017,0x0000009b,0x0004003d,0x00000008,0x0000009d,0x0000009c,0x00050084,
0x00000008,0x0000009e,0x00000099,0x0000009d,0x0004003d,0x00000008,0x0000009f,0x00000044,
0x00050080,0x00000008,0x000000a0,0x0000009f,0x0000009e,0x0003003e,0x00000044,0x000000a0,
0x0004003d,0x00000008,0x000000a2,0x00000082,0x0003003e,0x000000a1,0x000000a2,0x000200f9,
0x000000a3,0x000200f8,0x000000a3,0x000400f6,0x000000a5,0x000000a6,0x00000000,0x000200f9,
0x000000a7,0x000200f8,0x000000a7,0x0004003d,0x00000008,0x000000a8,0x000000a1,0x0004003d,
0x00000008,0x000000a9,0x0000002a,0x000500b1,0x00000050,0x000000aa,0x000000a8,0x000000a9,
0x000400fa,0x000000aa,0x000000a4,0x000000a5,0x000200f8,0x000000a4,0x00050041,0x0000000f,
0x000000ab,0x0000000d,0x00000017,0x0004003d,0x00000008,0x000000ac,0x000000ab,0x000300f7,
0x000000b1,0x00000000,0x000b00fb,0x000000ac,0x000000b1,0x00000012,0x000000ad,0x00000010,
0x000000ae,0x0000000f,0x000000af,0x00000013,0x000000b0,0x000200f8,0x000000ad,0x0004003d,
0x00000008,0x000000b7,0x00000044,0x0004003d,0x00000008,0x000000b8,0x000000a1,0x0004003d,
0x00000008,0x000000b9,0x00000025,0x00050084,0x00000008,0x000000ba,0x000000b8,0x000000b9,
0x00050080,0x00000008,0x000000bb,0x000000b7,0x000000ba,0x0004003d,0x00000008,0x000000c0,
0x00000042,0x0004003d,0x00000008,0x000000c1,0x000000a1,0x0004003d,0x00000008,0x000000c2,
0x00000012,0x00050084,0x00000008,0x000000c3,0x000000c1,0x000000c2,0x00050080,0x00000008,
0x000000c4,0x000000c0,0x000000c3,0x00060041,0x000000c5,0x000000c6,0x000000bf,0x00000017,
0x000000c4,0x0004003d,0x000000b2,0x000000c7,0x000000c6,0x0004003d,0x00000008,0x000000cc,
0x00000043,0x0004003d,0x00000008,0x000000cd,0x000000a1,0x0004003d,0x00000008,0x000000ce,
0x0000001e,0x00050084,0x00000008,0x000000cf,0x000000cd,0x000000ce,0x00050080,0x00000008,
0x000000d0,0x000000cc,0x000000cf,0x00060041,0x000000c5,0x000000d1,0x000000cb,0x00000017,
0x000000d0,0x0004003d,0x000000b2,0x000000d2,0x000000d1,0x00050081,0x000000b2,0x000000d3,
0x000000c7,0x000000d2,0x00060041,0x000000c5,0x000000d4,0x000000b6,0x00000017,0x000000bb,
0x0003003e,0x000000d4,0x000000d3,0x000200f9,0x000000b1,0x000200f8,0x000000ae,0x0004003d,
0x00000008,0x000000d6,0x00000044,0x0004003d,0x00000008,0x000000d7,0x000000a1,0x0004003d,
0x00000008,0x000000d8,0x00000025,0x00050084,0x00000008,0x000000d9,0x000000d7,0x000000d8,
0x00050080,0x00000008,0x000000da,0x000000d6,0x000000d9,0x0004003d,0x00000008,0x000000db,
0x00000042,0x0004003d,0x00000008,0x000000dc,0x000000a1,0x0004003d,0x00000008,0x000000dd,
0x00000012,0x00050084,0x00000008,0x000000de,0x000000dc,0x000000dd,0x00050080,0x00000008,
0x000000df,0x000000db,0x000000de,0x00060041,0x000000c5,0x000000e0,0x000000bf,0x00000017,
0x000000df,0x0004003d,0x000000b2,0x000000e1,0x000000e0,0x0004003d,0x00000008,0x000000e2,
0x00000043,0x0004003d,0x00000008,0x000000e3,0x000000a1,0x0004003d,0x00000008,0x000000e4,
0x0000001e,0x00050084,0x00000008,0x000000e5,0x000000e3,0x000000e4,0x00050080,0x00000008,
0x000000e6,0x000000e2,0x000000e5,0x00060041,0x000000c5,0x000000e7,0x000000cb,0x00000017,
0x000000e6,0x0004003d,0x000000b2,0x000000e8,0x000000e7,0x00050083,0x000000b2,0x000000e9,
0x000000e1,0x000000e8,0x00060041,0x000000c5,0x000000ea,0x000000b6,0x00000017,0x000000da,
0x0003003e,0x000000ea,0x000000e9,0x000200f9,0x000000b1,0x000200f8,0x000000af,0x0004003d,
0x00000008,0x000000ec,0x00000044,0x0004003d,0x00000008,0x000000ed,0x000000a1,0x0004003d,
0x00000008,0x000000ee,0x00000025,0x00050084,0x00000008,0x000000ef,0x000000ed,0x000000ee,
0x00050080,0x00000008,0x000000f0,0x000000ec,0x000000ef,0x0004003d,0x00000008,0x000000f1,
0x00000042,0x0004003d,0x00000008,0x000000f2,0x000000a1,0x0004003d,0x00000008,0x000000f3,
0x00000012,0x00050084,0x00000008,0x000000f4,0x000000f2,0x000000f3,0x00050080,0x00000008,
0x000000f5,0x000000f1,0x000000f4,0x00060041,0x000000c5,0x000000f6,0x000000bf,0x00000017,
0x000000f5,0x0004003d,0x000000b2,0x000000f7,0x000000f6,0x0004003d,0x00000008,0x000000f8,
0x00000043,0x0004003d,0x00000008,0x000000f9,0x000000a1,0x0004003d,0x00000008,0x000000fa,
0x0000001e,0x00050084,0x00000008,0x000000fb,0x000000f9,0x000000fa,0x00050080,0x00000008,
0x000000fc,0x000000f8,0x000000fb,0x00060041,0x000000c5,0x000000fd,0x000000cb,0x00000017,
0x000000fc,0x0004003d,0x000000b2,0x000000fe,0x000000fd,0x00050085,0x000000b2,0x000000ff,
0x000000f7,0x000000fe,0x00060041,0x000000c5,0x00000100,0x000000b6,0x00000017,0x000000f0,
0x0003003e,0x00000100,0x000000ff,0x000200f9,0x000000b1,0x000200f8,0x000000b0,0x0004003d,
0x00000008,0x00000102,0x00000044,0x0004003d,0x00000008,0x00000103,0x000000a1,0x0004003d,
0x00000008,0x00000104,0x00000025,0x00050084,0x00000008,0x00000105,0x00000103,0x00000104,
0x00050080,0x00000008,0x00000106,0x00000102,0x00000105,0x0004003d,0x00000008,0x00000107,
0x00000042,0x0004003d,0x00000008,0x00000108,0x000000a1,0x0004003d,0x00000008,0x00000109,
0x00000012,0x00050084,0x00000008,0x0000010a,0x00000108,0x00000109,0x00050080,0x00000008,
0x0000010b,0x00000107,0x0000010a,0x00060041,0x000000c5,0x0000010c,0x000000bf,0x00000017,
0x0000010b,0x0004003d,0x000000b2,0x0000010d,0x0000010c,0x0004003d,0x00000008,0x0000010e,
0x00000043,0x0004003d,0x00000008,0x0000010f,0x000000a1,0x0004003d,0x00000008,0x00000110,
0x0000001e,0x00050084,0x00000008,0x00000111,0x0000010f,0x00000110,0x00050080,0x00000008,
0x00000112,0x0000010e,0x00000111,0x00060041,0x000000c5,0x00000113,0x000000cb,0x00000017,
0x00000112,0x0004003d,0x000000b2,0x00000114,0x00000113,0x00050088,0x000000b2,0x00000115,
0x0000010d,0x00000114,0x00060041,0x000000c5,0x00000116,0x000000b6,0x00000017,0x00000106,
0x0003003e,0x00000116,0x00000115,0x000200f9,0x000000b1,0x000200f8,0x000000b1,0x000200f9,
0x000000a6,0x000200f8,0x000000a6,0x0004003d,0x00000008,0x0000011a,0x000000a1,0x00050080,
0x00000008,0x0000011b,0x0000011a,0x00000119,0x0003003e,0x000000a1,0x0000011b,0x000200f9,
0x000000a3,0x000200f8,0x000000a5,0x000100fd,0x00010038
};
}}} // namespace cv::dnn::vkcom

@ -12,10 +12,11 @@ std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
// Registers every embedded SPIR-V shader binary under its lookup name so
// pipelines can resolve them at runtime. The size is the element count of
// the corresponding array.
// Fix: the previous version inserted "gemm_spv" and "conv_depthwise_spv"
// twice; std::map::insert ignores the duplicate key, so removing the
// redundant calls is behavior-preserving.
void initSPVMaps()
{
    SPVMaps.insert(std::make_pair("conv_1x1_fast_spv", std::make_pair(conv_1x1_fast_spv, 3134)));
    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
    SPVMaps.insert(std::make_pair("conv_depthwise_3x3_spv", std::make_pair(conv_depthwise_3x3_spv, 1977)));
    SPVMaps.insert(std::make_pair("conv_implicit_gemm_spv", std::make_pair(conv_implicit_gemm_spv, 3565)));
    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
    SPVMaps.insert(std::make_pair("nary_eltwise_binary_forward_spv", std::make_pair(nary_eltwise_binary_forward_spv, 1757)));
}
}}} // namespace cv::dnn::vkcom

@ -9,10 +9,11 @@
namespace cv { namespace dnn { namespace vkcom {
// Declarations of the embedded SPIR-V shader binaries (defined in the
// generated spirv TUs) and the name -> (words, size) registry filled by
// initSPVMaps(). Fix: removed the duplicated re-declarations of gemm_spv
// and conv_depthwise_spv (legal but redundant).
extern const unsigned int conv_1x1_fast_spv[3134];
extern const unsigned int conv_depthwise_spv[2092];
extern const unsigned int conv_depthwise_3x3_spv[1977];
extern const unsigned int conv_implicit_gemm_spv[3565];
extern const unsigned int gemm_spv[2902];
extern const unsigned int nary_eltwise_binary_forward_spv[1757];
extern std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;

@ -0,0 +1,197 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../../precomp.hpp"
#include "internal.hpp"
#include "../include/op_naryeltwise.hpp"
namespace cv { namespace dnn { namespace vkcom {
#ifdef HAVE_VULKAN
#define STEP_SIZE 65536
#define MAX_GROUP_COUNT_X 65535
#define MAX_GROUP_COUNT_Y 65535
#define MAX_GROUP_COUNT_Z 65535
// Constructs a Vulkan n-ary element-wise operation.
//   _naryOpType : element-wise operation to perform
//   _ninputs    : number of input tensors (must be > 1)
//   _max_ndims  : common (broadcast-aligned) rank of all shapes
//   shapes/steps: (ninputs + 1) entries of length max_ndims each.
//                 NOTE(review): entry 0 appears to describe the output —
//                 the binary shader reads the first max_ndims shape/step
//                 values as the output's — confirm against NaryEltwiseLayer.
OpNary::OpNary(const OpNary::OPERATION _naryOpType, int _ninputs, int _max_ndims,
               const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps)
    : naryOpType(_naryOpType), ninputs(_ninputs), max_ndims(_max_ndims)
{
    CV_Assert(ninputs > 1);

    // Flatten the (ninputs + 1) x max_ndims shape/step tables into contiguous
    // buffers so they can be uploaded as single tensors.
    shapesBuf.resize((ninputs + 1) * max_ndims);
    stepsBuf.resize((ninputs + 1) * max_ndims);
    for (int i = 0; i <= ninputs; i++)
    {
        std::copy(shapes[i].begin(), shapes[i].end(), shapesBuf.data() + i * max_ndims);
        std::copy(steps[i].begin(), steps[i].end(), stepsBuf.data() + i * max_ndims);
    }

    // Select the shader variant for this operation.
    // TODO(VK): support more types of operation
    switch(naryOpType) {
        // case OPERATION::EQUAL:
        // case OPERATION::GREATER:
        // case OPERATION::GREATER_EQUAL:
        // case OPERATION::LESS:
        // case OPERATION::LESS_EQUAL:
        // case OPERATION::POW:
        // case OPERATION::BITSHIFT:
        // case OPERATION::MOD:
        case OPERATION::PROD:
        case OPERATION::SUB:
        case OPERATION::ADD:
        case OPERATION::DIV:
        // case OPERATION::AND:
        // case OPERATION::OR:
        // case OPERATION::XOR:
        {
            CV_Assert(ninputs == 2);
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeBinary;
            shader_name = "nary_eltwise_binary_forward_spv";
            // nplanes = product of all axes except the last two, taken from
            // entry 0 of shapesBuf; N2/N1 are the last two axes. These drive
            // the dispatch dimensions in computeGroupCount().
            // TODO(VK): confirm if this makes any sense
            nplanes = std::accumulate(shapesBuf.data(), shapesBuf.data() + max_ndims - 2, 1, [](int32_t a, int32_t b) { return a * b; } );
            N2 = shapesBuf.data()[max_ndims - 2];
            N1 = shapesBuf.data()[max_ndims - 1];
            CV_LOG_DEBUG(NULL, "max_ndims="<<max_ndims<<", nplanes="<<nplanes<<", N2="<<N2<<", N1="<<N1);
            break;
        }
        case OPERATION::WHERE:
        {
            // NOTE(review): the trinary shader named here is not registered in
            // initSPVMaps() yet — forward() will reject this path anyway.
            CV_Assert(ninputs == 3);
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeTrinary;
            shader_name = "nary_eltwise_trinary_forward_spv";
            break;
        }
        // case OPERATION::MAX:
        // case OPERATION::MEAN:
        // case OPERATION::MIN:
        case OPERATION::SUM:
        {
            CV_Assert(max_ndims >= 2);
            shaderType = kNaryShaderTypeNary;
            shader_name = "nary_eltwise_nary_forward_spv";
            break;
        }
        //TODO(VK) add other cases
        default:
            CV_Error(Error::StsNotImplemented, "Unsupported nary operation type");
    }
    // TODO(VK): initialize OpNary class
}
// One-time lazy initialization performed on the first forward() call:
// fixes the workgroup local sizes and computes the dispatch dimensions.
// Subsequent calls are no-ops.
void OpNary::firstForward()
{
    // Guard clause: everything below must run exactly once.
    if (firstForwardFinsh)
        return;

    // TODO(vk) determine local_size_{x,y,z} if necessary
    config.local_size_x = 1;
    config.local_size_y = 1;
    config.local_size_z = 1;
    computeGroupCount();
    firstForwardFinsh = true;
}
// Records and submits one compute dispatch of the binary element-wise shader:
// outs[0] = ins[0] op ins[1], with broadcasting encoded in the step buffer.
// Descriptor bindings must match nary_eltwise_binary_forward.comp:
//   0: input A, 1: input B, 2: output, 3: params (UBO), 4: shapes, 5: steps.
// Blocks until the GPU finishes (submitAndWait). Always returns true.
bool OpNary::binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
{
    // params UBO contents: {opType, ndims} — read by the shader as
    // params.opType / params.ndims.
    std::vector<int32_t> param = {(int32_t)naryOpType, max_ndims};
    std::vector<int32_t> paramSize = {(int32_t)param.size()};
    std::vector<int32_t> dimSizes = {(ninputs + 1) * max_ndims};
    std::vector<int32_t> actualSteps;

    // Convert byte strides to element strides (fp32 => divide by 4).
    // TODO(VK): compute step for different dtype. Currently this is for kFormatFp32.
    actualSteps.resize(stepsBuf.size());
    std::transform(stepsBuf.data(), stepsBuf.data() + dimSizes[0], actualSteps.begin(), [](int32_t sz){ return sz / 4; });

    Tensor paramTensor = Tensor(reinterpret_cast<const char *>(param.data()), paramSize, kFormatInt32, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
    Tensor shapeTensor = Tensor(reinterpret_cast<const char *>(shapesBuf.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    Tensor stepTensor = Tensor(reinterpret_cast<const char *>(actualSteps.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);

    // Descriptor types, in binding order (must agree with the shader layout).
    destTypes = {
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input1
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input2
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // out
        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // param
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // shape
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // step
    };

    Ptr<Pipeline> pipeline = pipelineFactoryPtr->getPipeline(shader_name, destTypes);
    Ptr<CommandBuffer> cmdBuffer = cmdPoolPtr->allocBuffer();
    Ptr<Descriptor> desSet = pipeline->createSet();
    VkCommandBuffer cmdBufferReal = cmdBuffer->get();

    // Bind tensors to the shader bindings listed above.
    desSet->writeTensor(ins[0], 0);
    desSet->writeTensor(ins[1], 1);
    desSet->writeTensor(outs[0], 2);
    desSet->writeTensor(paramTensor, 3);
    desSet->writeTensor(shapeTensor, 4);
    desSet->writeTensor(stepTensor, 5);

    // Record the dispatch (group counts were fixed in computeGroupCount())
    // and submit synchronously.
    cmdBuffer->beginRecord();
    pipeline->bind(cmdBufferReal, desSet->get());
    vkCmdDispatch(cmdBufferReal, group_x_, group_y_, group_z_);
    cmdBuffer->endRecord();
    cmdPoolPtr->submitAndWait(cmdBufferReal);

    return true;
}
// Runs the operation on the GPU: validates dtypes, performs one-time setup,
// then dispatches the shader variant selected in the constructor.
// Returns the result of the variant-specific forward; raises for variants
// that are not implemented yet.
// Fix: removed the unreachable `break` that followed the `return` inside the
// switch case (dead code).
bool OpNary::forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
{
    // Lazy one-time setup of local sizes and dispatch dimensions.
    firstForward();

    // TODO(VK): Support more dtypes. Currently only kFormatFp32 is supported.
    for (auto &tensor: ins)
    {
        CV_Assert(tensor.getFormat() == kFormatFp32);
    }
    for (auto &tensor: outs)
    {
        CV_Assert(tensor.getFormat() == kFormatFp32);
    }

    switch(shaderType) {
        case kNaryShaderTypeBinary:
            return binaryForward(ins, outs);
        default:
            CV_Error(Error::StsNotImplemented, "Unsupported shader type invoked.");
    }

    return true;
}
bool OpNary::computeGroupCount()
{
if (shaderType == kNaryShaderTypeBinary)
{
group_x_ = nplanes; // parallelism at plane level
group_y_ = N2;
group_z_ = 1;
}
else
{
CV_Error(CV_StsNotImplemented, "shader type is not supported at compute GroupCount.");
}
CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z);
return true;
}
#endif // HAVE_VULKAN
}}} // namespace cv::dnn::vkcom
Loading…
Cancel
Save