From 87f749277dbfd04907ac6b205573dee1f539c85a Mon Sep 17 00:00:00 2001 From: Haosonn <90189584+Haosonn@users.noreply.github.com> Date: Mon, 29 Jan 2024 23:41:49 +0800 Subject: [PATCH] Merge pull request #24768 from Haosonn:pre-pr-2 Vulkan backend for NaryEltwiseLayer in DNN module #24768 We improve the Vulkan backend for ``NaryEltwiseLayer`` in the DNN module by: - adding a basic framework for the Vulkan backend in ``NaryEltwiseLayer`` - adding a compute shader for binary forwarding (an imitation of what the native OpenCV backend does, including broadcasting and element-wise operations) - fixing a typo: - wrong info output in ``context.cpp`` Currently, our implementation (and, in fact, all layers supporting the Vulkan backend) runs quite slowly on discrete GPUs, mainly due to the I/O cost of the ``copyToHost`` function. We are going to fix that by: - finding the best ``VkMemoryProperty`` for various discrete GPUs - avoiding ``copyToHost`` in intermediate layers during forwarding (i.e., keeping data in GPU memory) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake Co-authored-by: IskXCr --- modules/dnn/perf/perf_layer.cpp | 3 + .../dnn/src/layers/nary_eltwise_layers.cpp | 410 ++++++++---------- .../dnn/src/vkcom/include/op_naryeltwise.hpp | 87 ++++ modules/dnn/src/vkcom/include/vkcom.hpp | 1 + .../shader/nary_eltwise_binary_forward.comp | 116 +++++ .../nary_eltwise_binary_forward_spv.cpp | 232 ++++++++++ modules/dnn/src/vkcom/shader/spv_shader.cpp | 5 +- modules/dnn/src/vkcom/shader/spv_shader.hpp | 5 +- modules/dnn/src/vkcom/src/op_naryEltwise.cpp | 197 +++++++++ 9 files changed, 824 insertions(+), 232 deletions(-) create mode 100644 modules/dnn/src/vkcom/include/op_naryeltwise.hpp create mode 100644 modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp create mode 100644 modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp create mode 100644 modules/dnn/src/vkcom/src/op_naryEltwise.cpp diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 27fe7d1504..acdc778b3c 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -848,6 +848,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple #ifdef HAVE_CUDA INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA))); #endif +#ifdef HAVE_VULKAN +INSTANTIATE_TEST_CASE_P(VULKAN, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN))); +#endif INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp 
b/modules/dnn/src/layers/nary_eltwise_layers.cpp index a3f2ba351b..5750766e51 100644 --- a/modules/dnn/src/layers/nary_eltwise_layers.cpp +++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp @@ -7,6 +7,7 @@ #include "../op_cuda.hpp" #include "../op_cann.hpp" #include "../ie_ngraph.hpp" +#include "../op_vkcom.hpp" #include @@ -34,8 +35,141 @@ static int _mod(int x, int y) { } } +class NaryEltwiseHelper CV_FINAL +{ +public: + int ninputs; + int narrays; + int max_ndims; + std::vector all_ndims; + std::vector> orig_shapes; + std::vector> orig_steps; + std::vector ptrs; + std::vector> shapes; + std::vector> steps; + + NaryEltwiseHelper() { + } + + void helperInit(const std::vector& inputs, const std::vector& outputs) + { + narrays = 0; + max_ndims = 0; + all_ndims.clear(); + orig_shapes.clear(); + orig_steps.clear(); + ptrs.clear(); + shapes.clear(); + steps.clear(); + + ninputs = inputs.size(); + narrays = ninputs + 1; + + // collect ndims + std::vector v_inp_dims; + std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; }); + const int* inp_ndims = v_inp_dims.data(); + int out_ndims = outputs[0].dims; + + // find max ndims for broadcasting + int i; + max_ndims = out_ndims > 2 ? out_ndims : 2; + for(i = 0; i < ninputs; i++) + max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i]; + + shapes = std::vector>(narrays, std::vector(max_ndims, 0)); + steps = std::vector>(narrays, std::vector(max_ndims, 0)); + ptrs = std::vector(narrays, nullptr); + + for(i = 0; i <= ninputs; i++) { + all_ndims.push_back(i == 0 ? 
out_ndims : inp_ndims[i-1]); + std::vector _size; + std::vector _step; + if (!i) { + std::transform(outputs[0].size.p, outputs[0].size.p + outputs[0].dims, std::back_inserter(_size), [](int s) { return s; }); + std::transform(outputs[0].step.p, outputs[0].step.p + outputs[0].dims, std::back_inserter(_step), [](size_t s) { return s; }); + } + else { + std::transform(inputs[i-1].size.p, inputs[i-1].size.p + inputs[i-1].dims, std::back_inserter(_size), [](int s) { return s; }); + std::transform(inputs[i-1].step.p, inputs[i-1].step.p + inputs[i-1].dims, std::back_inserter(_step), [](size_t s) { return s; }); + } + orig_shapes.push_back(_size); + orig_steps.push_back(_step); + } + } + + // use FP32 as default type in finalized() function + template + bool prepare_for_broadcast_op() + { + int i, j, k; + std::vector elemsize(this->all_ndims.size(), sizeof(T)); + + // step 1. + // * make all inputs and the output max_ndims-dimensional. + // ** prepend dimension 1 to the mat of less dims + // * compute proper step's + for (i = this->max_ndims-1; i >= 0; i--) { + for (k = 0; k < this->narrays; k++) { + j = this->all_ndims[k] - (this->max_ndims - i); + int sz_i = j >= 0 ? this->orig_shapes[k][j] : 1; + size_t st_i = j >= 0 && this->orig_steps[k][j] > 0 ? this->orig_steps[k][j] : + i == this->max_ndims-1 ? elemsize[k] : this->steps[k][i+1]*this->shapes[k][i+1]; + assert(st_i % elemsize[k] == 0); + this->shapes[k][i] = sz_i; + this->steps[k][i] = st_i; + if (this->shapes[k][i] == 0) + return false; + } + } + + // step 3. Let's do the flattening first, + // since we'd need proper values of steps to check continuity. + // this loop is probably the most tricky part + // in the whole implementation of broadcasting. 
+ j = this->max_ndims-1; + for (i = j - 1; i >= 0; i--) { + bool all_contiguous = true, all_scalars = true, all_consistent = true; + for(k = 0; k < this->narrays; k++) { + size_t st = this->steps[k][j]*this->shapes[k][j]; + bool prev_scalar = this->shapes[k][j] == 1; + bool scalar = this->shapes[k][i] == 1; + all_contiguous = all_contiguous && (st == this->steps[k][i]); + all_scalars = all_scalars && scalar; + all_consistent = all_consistent && (scalar == prev_scalar); + } + if (all_contiguous && (all_consistent || all_scalars)) { + for(k = 0; k < this->narrays; k++) + this->shapes[k][j] *= this->shapes[k][i]; + } else { + j--; + if (i < j) { + for(k = 0; k < this->narrays; k++) { + this->shapes[k][j] = this->shapes[k][i]; + this->steps[k][j] = this->steps[k][i]; + } + } + } + } + + // step 2. Set some step's to 0's. + for (i = this->max_ndims-1; i >= j; i--) { + for (k = 0; k < this->narrays; k++) + this->steps[k][i] = this->shapes[k][i] == 1 ? 0 : this->steps[k][i]; + } + for (; i >= 0; i--) { + for (k = 0; k < this->narrays; k++) { + this->steps[k][i] = 0; + this->shapes[k][i] = 1; + } + } + return true; + } +}; + class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer { + NaryEltwiseHelper helper; public: enum class OPERATION { @@ -130,6 +264,13 @@ public: op == OPERATION::MOD || op == OPERATION::FMOD ); + +#ifdef HAVE_VULKAN + if (backendId == DNN_BACKEND_VKCOM) + return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB || + op == OPERATION::DIV ; +#endif + if (backendId == DNN_BACKEND_CUDA) { return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM || op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD || @@ -166,72 +307,14 @@ public: return outShape; } - static bool prepare_for_broadcast_op( - int narrays, int max_ndims, const size_t* elemsize, - const int* ndims, const int** shape_, const size_t** step_, - int** shape, size_t** step) - { - int i, j, k; - - // step 1. 
- // * make all inputs and the output max_ndims-dimensional. - // ** prepend dimension 1 to the mat of less dims - // * compute proper step's - for (i = max_ndims-1; i >= 0; i-- ) { - for (k = 0; k < narrays; k++) { - j = ndims[k] - (max_ndims - i); - int sz_i = j >= 0 ? shape_[k][j] : 1; - size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] : - i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1]; - assert(st_i % elemsize[k] == 0); - shape[k][i] = sz_i; - step[k][i] = st_i; - if (shape[k][i] == 0) - return false; - } - } - // step 3. Let's do the flattening first, - // since we'd need proper values of steps to check continuity. - // this loop is probably the most tricky part - // in the whole implementation of broadcasting. - j = max_ndims-1; - for (i = j - 1; i >= 0; i--) { - bool all_contiguous = true, all_scalars = true, all_consistent = true; - for(k = 0; k < narrays; k++) { - size_t st = step[k][j]*shape[k][j]; - bool prev_scalar = shape[k][j] == 1; - bool scalar = shape[k][i] == 1; - all_contiguous = all_contiguous && (st == step[k][i]); - all_scalars = all_scalars && scalar; - all_consistent = all_consistent && (scalar == prev_scalar); - } - if (all_contiguous && (all_consistent || all_scalars)) { - for(k = 0; k < narrays; k++) - shape[k][j] *= shape[k][i]; - } else { - j--; - if (i < j) { - for(k = 0; k < narrays; k++) { - shape[k][j] = shape[k][i]; - step[k][j] = step[k][i]; - } - } - } - } + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - // step 2. Set some step's to 0's. - for (i = max_ndims-1; i >= j; i--) { - for (k = 0; k < narrays; k++) - step[k][i] = shape[k][i] == 1 ? 
0 : step[k][i]; - } - for (; i >= 0; i--) { - for (k = 0; k < narrays; k++) { - step[k][i] = 0; - shape[k][i] = 1; - } - } - return true; + helper.helperInit(inputs, outputs); + CV_Assert(helper.prepare_for_broadcast_op()); } bool getMemoryShapes(const std::vector &inputs, @@ -246,10 +329,10 @@ public: template void binary_forward_impl( - int ndims, const int* shape, - const char* data1, const size_t* step1, - const char* data2, const size_t* step2, - char* data, const size_t* step, + int ndims, const std::vector& shape, + const char* data1, const std::vector& step1, + const char* data2, const std::vector& step2, + char* data, const std::vector& step, const Functor& op) { assert(ndims >= 2); @@ -305,63 +388,18 @@ public: const Mat& a = inputs[0]; const Mat& b = inputs[1]; Mat& out = outputs[0]; - - // collect info of inputs and output - const int* in_shape[] = {a.size.p, b.size.p}; - const size_t* in_step[] = {a.step.p, b.step.p}; - const int* out_shape = out.size.p; - const size_t* out_step = out.step.p; - const int in_ndims[] = {a.dims, b.dims}; - int out_ndims = out.dims; - - int max_ndims = std::max(a.dims, std::max(b.dims, out.dims)); - - // buf holds the folllowing for a, b & output: - // * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), 3*4 elements in total - // * shape_buf & step_buf, 3*2*max_ndims elements in total - // * all_ndims, 3*1 elements in total - // * all_type_sizes, 3*1 elements in total - AutoBuffer buf(3 * (2 * max_ndims + 6)); - - int** orig_shapes = (int**)(buf.data()); - int** shapes = orig_shapes + 3; - size_t** orig_steps = (size_t**)(shapes + 3); - size_t** steps = orig_steps + 3; - - int* shape_buf = (int*)(steps + 3); - size_t* step_buf = (size_t*)(shape_buf + 3 * max_ndims); - - int* all_ndims = (int*)(step_buf + 3 * max_ndims); - size_t* all_type_sizes = (size_t*)(all_ndims + 3); - - // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes - for (int i = 0; i < 3; i++) - { - orig_shapes[i] = 
(int*)(i == 0 ? out_shape : in_shape[i-1]); - orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]); - shapes[i] = shape_buf + i * max_ndims; - steps[i] = step_buf + i * max_ndims; - all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1]; - all_type_sizes[i] = sizeof(T); - } - - if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes, - all_ndims, (const int**)orig_shapes, - (const size_t**)orig_steps, - shapes, steps)) - return; - + CV_Assert(helper.shapes.size() == 3 && helper.steps.size() == 3); binary_forward_impl( - max_ndims, shapes[0], a.ptr(), steps[1], - b.ptr(), steps[2], out.ptr(), steps[0], + helper.max_ndims, helper.shapes[0], a.ptr(), helper.steps[1], + b.ptr(), helper.steps[2], out.ptr(), helper.steps[0], f); } template void nary_forward_impl( - const Functor& f, const T scale, int ninputs, int ndims, const int* shape, + const Functor& f, const T scale, int ninputs, int ndims, const std::vector& shape, const char** inp, char* out, - const size_t** steps, char** ptrs) + const std::vector>& steps, std::vector& ptrs) { CV_Assert(ndims >= 2); size_t dp = steps[0][ndims-1]/sizeof(T); @@ -446,77 +484,16 @@ public: const std::vector& inputs, std::vector& outputs ) { - int ninputs = inputs.size(); - - // collect all input + // collect all input info std::vector v_inp; std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr(); }); const char** inp = v_inp.data(); - // collect ndims of all input - std::vector v_inp_dims; - std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; }); - const int* inp_ndims = v_inp_dims.data(); - - // collect shapes of all input - std::vector v_inp_shape; - std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; }); - const int** inp_shape = v_inp_shape.data(); - - // collect steps of all input - std::vector v_inp_step; - std::transform(inputs.begin(), 
inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; }); - const size_t** inp_step = v_inp_step.data(); - - // collect info of output (ndims, shape, step) + // collect output info char* out = outputs[0].ptr(); - int out_ndims = outputs[0].dims; - const int* out_shape = outputs[0].size.p; - const size_t* out_step = outputs[0].step.p; - - // find max ndims for broadcasting - int i, max_ndims = out_ndims > 2 ? out_ndims : 2; - for(i = 0; i < ninputs; i++) - max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i]; - - // buf holds the following buffers for inputs & output: - // * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total - // * ptrs, (ninputs+1)*1 elements in total - // * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total - // * all_ndims, (ninputs+1)*1 elements in total - // * all_type_sizes, (ninputs+1)*1 elements in total - AutoBuffer buf((ninputs + 1) * (2 * max_ndims + 7)); - - int** orig_shapes = (int**)buf.data(); - int** shapes = orig_shapes + ninputs + 1; - size_t** orig_steps = (size_t**)(shapes + ninputs + 1); - size_t** steps = orig_steps + ninputs + 1; - - char** ptrs = (char**)(steps + ninputs + 1); - - size_t* step_buf = (size_t*)(ptrs + ninputs + 1); - int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims); - - int* all_ndims = shape_buf + (ninputs + 1)*max_ndims; - size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1); - - for(i = 0; i <= ninputs; i++) { - all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1]; - all_type_sizes[i] = sizeof(T); - orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0); - orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? 
inp_step[i-1] : 0); - shapes[i] = shape_buf + max_ndims*i; - steps[i] = step_buf + max_ndims*i; - } - - if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes, - all_ndims, (const int**)orig_shapes, - (const size_t**)orig_steps, - shapes, steps)) - return; nary_forward_impl( - f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs); + f, scale, helper.ninputs, helper.max_ndims, helper.shapes[0], inp, out, helper.steps, helper.ptrs); } template @@ -527,59 +504,21 @@ public: const Mat& c = inputs[2]; Mat& out = outputs[0]; - // collect info of inputs and output - const int* in_shape[] = {a.size.p, b.size.p, c.size.p}; - const size_t* in_step[] = {a.step.p, b.step.p, c.step.p}; - const int* out_shape = out.size.p; - const size_t* out_step = out.step.p; - const int in_ndims[] = {a.dims, b.dims, c.dims}; - int out_ndims = out.dims; - - int max_ndims = std::max(a.dims, std::max(b.dims, std::max(c.dims, out.dims))); - - AutoBuffer buf(4 * (2 * max_ndims + 6)); - - int** orig_shapes = (int**)(buf.data()); - int** shapes = orig_shapes + 4; - size_t** orig_steps = (size_t**)(shapes + 4); - size_t** steps = orig_steps + 4; - - int* shape_buf = (int*)(steps + 4); - size_t* step_buf = (size_t*)(shape_buf + 4 * max_ndims); - - int* all_ndims = (int*)(step_buf + 4 * max_ndims); - size_t* all_type_sizes = (size_t*)(all_ndims + 4); - - // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes - for (int i = 0; i < 4; i++) - { - orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]); - orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]); - shapes[i] = shape_buf + i * max_ndims; - steps[i] = step_buf + i * max_ndims; - all_ndims[i] = i == 0 ? 
out_ndims : in_ndims[i-1]; - all_type_sizes[i] = sizeof(T); - } - - if (!prepare_for_broadcast_op(4, max_ndims, all_type_sizes, - all_ndims, (const int**)orig_shapes, - (const size_t**)orig_steps, - shapes, steps)) - return; + CV_Assert(helper.shapes.size() == 4 && helper.steps.size() == 4); trinary_forward_impl( - max_ndims, shapes[0], a.ptr(), steps[1], b.ptr(), steps[2], - c.ptr(), steps[3], out.ptr(), steps[0], + helper.max_ndims, helper.shapes[0], a.ptr(), helper.steps[1], b.ptr(), helper.steps[2], + c.ptr(), helper.steps[3], out.ptr(), helper.steps[0], f); } template void trinary_forward_impl( - int ndims, const int* shape, - const char* data1, const size_t* step1, - const char* data2, const size_t* step2, - const char* data3, const size_t* step3, - char* data, const size_t* step, + int ndims, const std::vector& shape, + const char* data1, const std::vector& step1, + const char* data2, const std::vector& step2, + const char* data3, const std::vector& step3, + char* data, const std::vector& step, const Functor& op) { assert(ndims >= 2); @@ -795,6 +734,11 @@ public: { case CV_8U: opDispatch(std::forward(args)...); + helper.prepare_for_broadcast_op(); + /* + recompute broadcasted shapes + because default type is FP32 which is calculated in finalize() function + */ break; case CV_32S: opDispatch(std::forward(args)...); @@ -954,6 +898,16 @@ public: return Ptr(new InfEngineNgraphNode(node)); } #endif + +#ifdef HAVE_VULKAN + virtual Ptr initVkCom(const std::vector > &inputs, + std::vector > &outputs) CV_OVERRIDE + { + Ptr op = makePtr((vkcom::OpNary::OPERATION) this->op, helper.ninputs, helper.max_ndims, helper.shapes, helper.steps); + return Ptr(makePtr(inputs, op, outputs)); + } +#endif + }; Ptr NaryEltwiseLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/vkcom/include/op_naryeltwise.hpp b/modules/dnn/src/vkcom/include/op_naryeltwise.hpp new file mode 100644 index 0000000000..1d108298bf --- /dev/null +++ 
b/modules/dnn/src/vkcom/include/op_naryeltwise.hpp @@ -0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_OP_NARY_HPP +#define OPENCV_OP_NARY_HPP + +#include "vkcom.hpp" +#include "op_base.hpp" + +namespace cv { namespace dnn { namespace vkcom { + +#ifdef HAVE_VULKAN + +enum NaryShaderType +{ + kNaryShaderTypeBinary, + kNaryShaderTypeTrinary, + kNaryShaderTypeNary, + kNaryShaderTest, +}; + +struct NaryShaderConfig +{ + int local_size_x; + int local_size_y; + int local_size_z; +}; + + +class OpNary : public OpBase +{ +public: + // Copied from nary_eltwise_layers.cpp + enum class OPERATION + { + AND = 0, + EQUAL, + GREATER, + GREATER_EQUAL, + LESS, + LESS_EQUAL, + OR, + POW, + XOR, + BITSHIFT, + MAX, + MEAN, + MIN, + MOD, + PROD, + SUB, + SUM, + ADD, + DIV, + WHERE, + }; + + OpNary(const OPERATION naryOpType, int ninputs, int max_ndims, const std::vector> shapes, const std::vector> steps); + + void firstForward(); // Execute only in the first forward. 
+ virtual bool forward(std::vector& ins, std::vector& outs) CV_OVERRIDE; + Ptr weightTensorPtr; +private: + bool computeGroupCount(); + bool binaryForward(std::vector& ins, std::vector& outs); + bool trinaryForward(std::vector& ins, std::vector& outs); + bool naryForward(std::vector& ins, std::vector& outs); + + const OPERATION naryOpType; + NaryShaderType shaderType; + NaryShaderConfig config; + int ninputs; + int max_ndims; + AutoBuffer shapesBuf; + AutoBuffer stepsBuf; + int nplanes; // number of planes computations are to be performed on + int N2; // value of shape[ndims - 2] + int N1; // value of shape[ndims - 1] + + bool firstForwardFinsh = false; +}; + +#endif // HAVE_VULKAN + +}}} // namespace cv::dnn::vkcom +#endif // OPENCV_OP_NARY_HPP diff --git a/modules/dnn/src/vkcom/include/vkcom.hpp b/modules/dnn/src/vkcom/include/vkcom.hpp index 4c774abfb0..c152a74a1f 100644 --- a/modules/dnn/src/vkcom/include/vkcom.hpp +++ b/modules/dnn/src/vkcom/include/vkcom.hpp @@ -51,5 +51,6 @@ bool isAvailable(); #include "op_base.hpp" #include "op_conv.hpp" #include "op_matmul.hpp" +#include "op_naryeltwise.hpp" #endif // OPENCV_DNN_VKCOM_HPP diff --git a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp new file mode 100644 index 0000000000..295f157a88 --- /dev/null +++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp @@ -0,0 +1,116 @@ +#version 450 +// #extension GL_EXT_debug_printf : enable +#define ALL_THREAD 1024 +// #define ALL_THREAD 128 // Experimental batched operation +#define STEP_SIZE 65536 + +layout(binding = 0) readonly buffer Input1{ + float matA[]; +}; + +layout(binding = 1) readonly buffer Input2{ + float matB[]; +}; + +layout(binding = 2) writeonly buffer Output{ + float matOut[]; +}; + +layout(binding = 3) uniform Params { + int opType; + int ndims; +} params; + +layout(binding = 4) readonly buffer Shape { + int shape[]; +}; + +layout(binding = 5) readonly buffer 
Step { + int matStep[]; +}; + +/* local_size_x, local_size_y, local_size_z there defines the number of invocations + of this compute shader in the current work group. */ +// TODO: Check if this makes any sense +// TODO: Check if it is required to fetch PhysicalDeviceLimit from Context +// TODO: here we shall assume that maxGroupInvocation is 1024. +layout(local_size_x = ALL_THREAD, local_size_y = 1, local_size_z = 1) in; // TODO: Check if this makes any sense + +const int AND = 0; +const int EQUAL = 1; +const int GREATER = 2; +const int GREATER_EQUAL = 3; +const int LESS = 4; +const int LESS_EQUAL = 5; +const int OR = 6; +const int POW = 7; +const int XOR = 8; +const int BITSHIFT = 9; +const int MAX = 10; +const int MEAN = 11; +const int MIN = 12; +const int MOD = 13; +const int FMOD = 14; +const int PROD = 15; +const int SUB = 16; +const int SUM = 17; +const int ADD = 18; +const int DIV = 19; +const int WHERE = 20; + +void binary_forward() +{ + int ndims = params.ndims; + int dp1 = matStep[2 * ndims - 1]; + int dp2 = matStep[3 * ndims - 1]; + int dp = matStep[ndims - 1]; + int n1 = shape[ndims - 1], n2 = shape[ndims - 2]; + + int plane_idx = int(gl_WorkGroupID.x); + + int ptr1 = 0; + int ptr2 = 0; + int ptr = 0; + int idx = plane_idx; + + for (int k = ndims - 3; k >= 0; --k) { + int next_idx = idx / shape[k]; + int i_k = idx - next_idx * shape[k]; // i_k = idx % shape[k] + ptr1 += i_k * matStep[ndims + k]; + ptr2 += i_k * matStep[2 * ndims + k]; + ptr += i_k * matStep[k]; + idx = next_idx; + } + + int i2_offset = int(gl_WorkGroupID.y); + int i1_offset = int(gl_LocalInvocationID.x); + + ptr1 += i2_offset * matStep[2 * ndims - 2]; + ptr2 += i2_offset * matStep[3 * ndims - 2]; + ptr += i2_offset * matStep[ndims - 2]; + + for (int i1 = i1_offset; i1 < n1; i1 += ALL_THREAD) { + switch (params.opType) { + case int(ADD): + matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] + matB[ptr2 + i1 * dp2]; + break; + case int(SUB): + matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] - 
matB[ptr2 + i1 * dp2]; + break; + case int(PROD): + matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] * matB[ptr2 + i1 * dp2]; + break; + case int(DIV): + matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] / matB[ptr2 + i1 * dp2]; + break; + } + } +} + + +void main() +{ + // debugPrintfEXT("nary_eltwise_binary_forward.comp loaded\n"); + binary_forward(); + return; +} diff --git a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp new file mode 100644 index 0000000000..e4c994a853 --- /dev/null +++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp @@ -0,0 +1,232 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../../precomp.hpp" + +namespace cv { namespace dnn { namespace vkcom { + +extern const unsigned int nary_eltwise_binary_forward_spv[1757] = { + 0x07230203,0x00010000,0x0008000b,0x00000131,0x00000000,0x00020011,0x00000001,0x0006000b, + 0x00000001,0x4c534c47,0x6474732e,0x3035342e,0x00000000,0x0003000e,0x00000000,0x00000001, + 0x0007000f,0x00000005,0x00000004,0x6e69616d,0x00000000,0x0000003c,0x00000083,0x00060010, + 0x00000004,0x00000011,0x00000400,0x00000001,0x00000001,0x00030003,0x00000002,0x000001c2, + 0x00040005,0x00000004,0x6e69616d,0x00000000,0x00060005,0x00000006,0x616e6962,0x665f7972, + 0x6177726f,0x00286472,0x00040005,0x0000000a,0x6d69646e,0x00000073,0x00040005,0x0000000b, + 0x61726150,0x0000736d,0x00050006,0x0000000b,0x00000000,0x7954706f,0x00006570,0x00050006, + 0x0000000b,0x00000001,0x6d69646e,0x00000073,0x00040005,0x0000000d,0x61726170,0x0000736d, + 0x00030005,0x00000012,0x00317064,0x00040005,0x00000014,0x70657453,0x00000000,0x00050006, + 0x00000014,0x00000000,0x5374616d,0x00706574,0x00030005,0x00000016,0x00000000,0x00030005, + 
0x0000001e,0x00327064,0x00030005,0x00000025,0x00007064,0x00030005,0x0000002a,0x0000316e, + 0x00040005,0x0000002c,0x70616853,0x00000065,0x00050006,0x0000002c,0x00000000,0x70616873, + 0x00000065,0x00030005,0x0000002e,0x00000000,0x00030005,0x00000033,0x0000326e,0x00050005, + 0x00000038,0x6e616c70,0x64695f65,0x00000078,0x00060005,0x0000003c,0x575f6c67,0x476b726f, + 0x70756f72,0x00004449,0x00040005,0x00000042,0x31727470,0x00000000,0x00040005,0x00000043, + 0x32727470,0x00000000,0x00030005,0x00000044,0x00727470,0x00030005,0x00000045,0x00786469, + 0x00030005,0x00000047,0x0000006b,0x00050005,0x00000052,0x7478656e,0x7864695f,0x00000000, + 0x00030005,0x00000058,0x006b5f69,0x00050005,0x0000007d,0x6f5f3269,0x65736666,0x00000074, + 0x00050005,0x00000082,0x6f5f3169,0x65736666,0x00000074,0x00080005,0x00000083,0x4c5f6c67, + 0x6c61636f,0x6f766e49,0x69746163,0x44496e6f,0x00000000,0x00030005,0x000000a1,0x00003169, + 0x00040005,0x000000b4,0x7074754f,0x00007475,0x00050006,0x000000b4,0x00000000,0x4f74616d, + 0x00007475,0x00030005,0x000000b6,0x00000000,0x00040005,0x000000bd,0x75706e49,0x00003174, + 0x00050006,0x000000bd,0x00000000,0x4174616d,0x00000000,0x00030005,0x000000bf,0x00000000, + 0x00040005,0x000000c9,0x75706e49,0x00003274,0x00050006,0x000000c9,0x00000000,0x4274616d, + 0x00000000,0x00030005,0x000000cb,0x00000000,0x00050048,0x0000000b,0x00000000,0x00000023, + 0x00000000,0x00050048,0x0000000b,0x00000001,0x00000023,0x00000004,0x00030047,0x0000000b, + 0x00000002,0x00040047,0x0000000d,0x00000022,0x00000000,0x00040047,0x0000000d,0x00000021, + 0x00000003,0x00040047,0x00000013,0x00000006,0x00000004,0x00040048,0x00000014,0x00000000, + 0x00000018,0x00050048,0x00000014,0x00000000,0x00000023,0x00000000,0x00030047,0x00000014, + 0x00000003,0x00040047,0x00000016,0x00000022,0x00000000,0x00040047,0x00000016,0x00000021, + 0x00000005,0x00040047,0x0000002b,0x00000006,0x00000004,0x00040048,0x0000002c,0x00000000, + 0x00000018,0x00050048,0x0000002c,0x00000000,0x00000023,0x00000000,0x00030047,0x0000002c, 
+ 0x00000003,0x00040047,0x0000002e,0x00000022,0x00000000,0x00040047,0x0000002e,0x00000021, + 0x00000004,0x00040047,0x0000003c,0x0000000b,0x0000001a,0x00040047,0x00000083,0x0000000b, + 0x0000001b,0x00040047,0x000000b3,0x00000006,0x00000004,0x00040048,0x000000b4,0x00000000, + 0x00000019,0x00050048,0x000000b4,0x00000000,0x00000023,0x00000000,0x00030047,0x000000b4, + 0x00000003,0x00040047,0x000000b6,0x00000022,0x00000000,0x00040047,0x000000b6,0x00000021, + 0x00000002,0x00040047,0x000000bc,0x00000006,0x00000004,0x00040048,0x000000bd,0x00000000, + 0x00000018,0x00050048,0x000000bd,0x00000000,0x00000023,0x00000000,0x00030047,0x000000bd, + 0x00000003,0x00040047,0x000000bf,0x00000022,0x00000000,0x00040047,0x000000bf,0x00000021, + 0x00000000,0x00040047,0x000000c8,0x00000006,0x00000004,0x00040048,0x000000c9,0x00000000, + 0x00000018,0x00050048,0x000000c9,0x00000000,0x00000023,0x00000000,0x00030047,0x000000c9, + 0x00000003,0x00040047,0x000000cb,0x00000022,0x00000000,0x00040047,0x000000cb,0x00000021, + 0x00000001,0x00040047,0x0000011f,0x0000000b,0x00000019,0x00020013,0x00000002,0x00030021, + 0x00000003,0x00000002,0x00040015,0x00000008,0x00000020,0x00000001,0x00040020,0x00000009, + 0x00000007,0x00000008,0x0004001e,0x0000000b,0x00000008,0x00000008,0x00040020,0x0000000c, + 0x00000002,0x0000000b,0x0004003b,0x0000000c,0x0000000d,0x00000002,0x0004002b,0x00000008, + 0x0000000e,0x00000001,0x00040020,0x0000000f,0x00000002,0x00000008,0x0003001d,0x00000013, + 0x00000008,0x0003001e,0x00000014,0x00000013,0x00040020,0x00000015,0x00000002,0x00000014, + 0x0004003b,0x00000015,0x00000016,0x00000002,0x0004002b,0x00000008,0x00000017,0x00000000, + 0x0004002b,0x00000008,0x00000018,0x00000002,0x0004002b,0x00000008,0x0000001f,0x00000003, + 0x0003001d,0x0000002b,0x00000008,0x0003001e,0x0000002c,0x0000002b,0x00040020,0x0000002d, + 0x00000002,0x0000002c,0x0004003b,0x0000002d,0x0000002e,0x00000002,0x00040015,0x00000039, + 
0x00000020,0x00000000,0x00040017,0x0000003a,0x00000039,0x00000003,0x00040020,0x0000003b, + 0x00000001,0x0000003a,0x0004003b,0x0000003b,0x0000003c,0x00000001,0x0004002b,0x00000039, + 0x0000003d,0x00000000,0x00040020,0x0000003e,0x00000001,0x00000039,0x00020014,0x00000050, + 0x0004002b,0x00000039,0x0000007e,0x00000001,0x0004003b,0x0000003b,0x00000083,0x00000001, + 0x00030016,0x000000b2,0x00000020,0x0003001d,0x000000b3,0x000000b2,0x0003001e,0x000000b4, + 0x000000b3,0x00040020,0x000000b5,0x00000002,0x000000b4,0x0004003b,0x000000b5,0x000000b6, + 0x00000002,0x0003001d,0x000000bc,0x000000b2,0x0003001e,0x000000bd,0x000000bc,0x00040020, + 0x000000be,0x00000002,0x000000bd,0x0004003b,0x000000be,0x000000bf,0x00000002,0x00040020, + 0x000000c5,0x00000002,0x000000b2,0x0003001d,0x000000c8,0x000000b2,0x0003001e,0x000000c9, + 0x000000c8,0x00040020,0x000000ca,0x00000002,0x000000c9,0x0004003b,0x000000ca,0x000000cb, + 0x00000002,0x0004002b,0x00000008,0x00000119,0x00000400,0x0004002b,0x00000039,0x0000011e, + 0x00000400,0x0006002c,0x0000003a,0x0000011f,0x0000011e,0x0000007e,0x0000007e,0x0004002b, + 0x00000008,0x00000120,0x00000004,0x0004002b,0x00000008,0x00000121,0x00000005,0x0004002b, + 0x00000008,0x00000122,0x00000006,0x0004002b,0x00000008,0x00000123,0x00000007,0x0004002b, + 0x00000008,0x00000124,0x00000008,0x0004002b,0x00000008,0x00000125,0x00000009,0x0004002b, + 0x00000008,0x00000126,0x0000000a,0x0004002b,0x00000008,0x00000127,0x0000000b,0x0004002b, + 0x00000008,0x00000128,0x0000000c,0x0004002b,0x00000008,0x00000129,0x0000000d,0x0004002b, + 0x00000008,0x0000012a,0x0000000e,0x0004002b,0x00000008,0x0000012b,0x0000000f,0x0004002b, + 0x00000008,0x0000012c,0x00000010,0x0004002b,0x00000008,0x0000012d,0x00000011,0x0004002b, + 0x00000008,0x0000012e,0x00000012,0x0004002b,0x00000008,0x0000012f,0x00000013,0x0004002b, + 0x00000008,0x00000130,0x00000014,0x00050036,0x00000002,0x00000004,0x00000000,0x00000003, + 0x000200f8,0x00000005,0x00040039,0x00000002,0x0000011c,0x00000006,0x000100fd,0x00010038, 
+ 0x00050036,0x00000002,0x00000006,0x00000000,0x00000003,0x000200f8,0x00000007,0x0004003b, + 0x00000009,0x0000000a,0x00000007,0x0004003b,0x00000009,0x00000012,0x00000007,0x0004003b, + 0x00000009,0x0000001e,0x00000007,0x0004003b,0x00000009,0x00000025,0x00000007,0x0004003b, + 0x00000009,0x0000002a,0x00000007,0x0004003b,0x00000009,0x00000033,0x00000007,0x0004003b, + 0x00000009,0x00000038,0x00000007,0x0004003b,0x00000009,0x00000042,0x00000007,0x0004003b, + 0x00000009,0x00000043,0x00000007,0x0004003b,0x00000009,0x00000044,0x00000007,0x0004003b, + 0x00000009,0x00000045,0x00000007,0x0004003b,0x00000009,0x00000047,0x00000007,0x0004003b, + 0x00000009,0x00000052,0x00000007,0x0004003b,0x00000009,0x00000058,0x00000007,0x0004003b, + 0x00000009,0x0000007d,0x00000007,0x0004003b,0x00000009,0x00000082,0x00000007,0x0004003b, + 0x00000009,0x000000a1,0x00000007,0x00050041,0x0000000f,0x00000010,0x0000000d,0x0000000e, + 0x0004003d,0x00000008,0x00000011,0x00000010,0x0003003e,0x0000000a,0x00000011,0x0004003d, + 0x00000008,0x00000019,0x0000000a,0x00050084,0x00000008,0x0000001a,0x00000018,0x00000019, + 0x00050082,0x00000008,0x0000001b,0x0000001a,0x0000000e,0x00060041,0x0000000f,0x0000001c, + 0x00000016,0x00000017,0x0000001b,0x0004003d,0x00000008,0x0000001d,0x0000001c,0x0003003e, + 0x00000012,0x0000001d,0x0004003d,0x00000008,0x00000020,0x0000000a,0x00050084,0x00000008, + 0x00000021,0x0000001f,0x00000020,0x00050082,0x00000008,0x00000022,0x00000021,0x0000000e, + 0x00060041,0x0000000f,0x00000023,0x00000016,0x00000017,0x00000022,0x0004003d,0x00000008, + 0x00000024,0x00000023,0x0003003e,0x0000001e,0x00000024,0x0004003d,0x00000008,0x00000026, + 0x0000000a,0x00050082,0x00000008,0x00000027,0x00000026,0x0000000e,0x00060041,0x0000000f, + 0x00000028,0x00000016,0x00000017,0x00000027,0x0004003d,0x00000008,0x00000029,0x00000028, + 0x0003003e,0x00000025,0x00000029,0x0004003d,0x00000008,0x0000002f,0x0000000a,0x00050082, + 
0x00000008,0x00000030,0x0000002f,0x0000000e,0x00060041,0x0000000f,0x00000031,0x0000002e, + 0x00000017,0x00000030,0x0004003d,0x00000008,0x00000032,0x00000031,0x0003003e,0x0000002a, + 0x00000032,0x0004003d,0x00000008,0x00000034,0x0000000a,0x00050082,0x00000008,0x00000035, + 0x00000034,0x00000018,0x00060041,0x0000000f,0x00000036,0x0000002e,0x00000017,0x00000035, + 0x0004003d,0x00000008,0x00000037,0x00000036,0x0003003e,0x00000033,0x00000037,0x00050041, + 0x0000003e,0x0000003f,0x0000003c,0x0000003d,0x0004003d,0x00000039,0x00000040,0x0000003f, + 0x0004007c,0x00000008,0x00000041,0x00000040,0x0003003e,0x00000038,0x00000041,0x0003003e, + 0x00000042,0x00000017,0x0003003e,0x00000043,0x00000017,0x0003003e,0x00000044,0x00000017, + 0x0004003d,0x00000008,0x00000046,0x00000038,0x0003003e,0x00000045,0x00000046,0x0004003d, + 0x00000008,0x00000048,0x0000000a,0x00050082,0x00000008,0x00000049,0x00000048,0x0000001f, + 0x0003003e,0x00000047,0x00000049,0x000200f9,0x0000004a,0x000200f8,0x0000004a,0x000400f6, + 0x0000004c,0x0000004d,0x00000000,0x000200f9,0x0000004e,0x000200f8,0x0000004e,0x0004003d, + 0x00000008,0x0000004f,0x00000047,0x000500af,0x00000050,0x00000051,0x0000004f,0x00000017, + 0x000400fa,0x00000051,0x0000004b,0x0000004c,0x000200f8,0x0000004b,0x0004003d,0x00000008, + 0x00000053,0x00000045,0x0004003d,0x00000008,0x00000054,0x00000047,0x00060041,0x0000000f, + 0x00000055,0x0000002e,0x00000017,0x00000054,0x0004003d,0x00000008,0x00000056,0x00000055, + 0x00050087,0x00000008,0x00000057,0x00000053,0x00000056,0x0003003e,0x00000052,0x00000057, + 0x0004003d,0x00000008,0x00000059,0x00000045,0x0004003d,0x00000008,0x0000005a,0x00000052, + 0x0004003d,0x00000008,0x0000005b,0x00000047,0x00060041,0x0000000f,0x0000005c,0x0000002e, + 0x00000017,0x0000005b,0x0004003d,0x00000008,0x0000005d,0x0000005c,0x00050084,0x00000008, + 0x0000005e,0x0000005a,0x0000005d,0x00050082,0x00000008,0x0000005f,0x00000059,0x0000005e, + 0x0003003e,0x00000058,0x0000005f,0x0004003d,0x00000008,0x00000060,0x00000058,0x0004003d, 
+ 0x00000008,0x00000061,0x0000000a,0x0004003d,0x00000008,0x00000062,0x00000047,0x00050080, + 0x00000008,0x00000063,0x00000061,0x00000062,0x00060041,0x0000000f,0x00000064,0x00000016, + 0x00000017,0x00000063,0x0004003d,0x00000008,0x00000065,0x00000064,0x00050084,0x00000008, + 0x00000066,0x00000060,0x00000065,0x0004003d,0x00000008,0x00000067,0x00000042,0x00050080, + 0x00000008,0x00000068,0x00000067,0x00000066,0x0003003e,0x00000042,0x00000068,0x0004003d, + 0x00000008,0x00000069,0x00000058,0x0004003d,0x00000008,0x0000006a,0x0000000a,0x00050084, + 0x00000008,0x0000006b,0x00000018,0x0000006a,0x0004003d,0x00000008,0x0000006c,0x00000047, + 0x00050080,0x00000008,0x0000006d,0x0000006b,0x0000006c,0x00060041,0x0000000f,0x0000006e, + 0x00000016,0x00000017,0x0000006d,0x0004003d,0x00000008,0x0000006f,0x0000006e,0x00050084, + 0x00000008,0x00000070,0x00000069,0x0000006f,0x0004003d,0x00000008,0x00000071,0x00000043, + 0x00050080,0x00000008,0x00000072,0x00000071,0x00000070,0x0003003e,0x00000043,0x00000072, + 0x0004003d,0x00000008,0x00000073,0x00000058,0x0004003d,0x00000008,0x00000074,0x00000047, + 0x00060041,0x0000000f,0x00000075,0x00000016,0x00000017,0x00000074,0x0004003d,0x00000008, + 0x00000076,0x00000075,0x00050084,0x00000008,0x00000077,0x00000073,0x00000076,0x0004003d, + 0x00000008,0x00000078,0x00000044,0x00050080,0x00000008,0x00000079,0x00000078,0x00000077, + 0x0003003e,0x00000044,0x00000079,0x0004003d,0x00000008,0x0000007a,0x00000052,0x0003003e, + 0x00000045,0x0000007a,0x000200f9,0x0000004d,0x000200f8,0x0000004d,0x0004003d,0x00000008, + 0x0000007b,0x00000047,0x00050082,0x00000008,0x0000007c,0x0000007b,0x0000000e,0x0003003e, + 0x00000047,0x0000007c,0x000200f9,0x0000004a,0x000200f8,0x0000004c,0x00050041,0x0000003e, + 0x0000007f,0x0000003c,0x0000007e,0x0004003d,0x00000039,0x00000080,0x0000007f,0x0004007c, + 0x00000008,0x00000081,0x00000080,0x0003003e,0x0000007d,0x00000081,0x00050041,0x0000003e, + 
0x00000084,0x00000083,0x0000003d,0x0004003d,0x00000039,0x00000085,0x00000084,0x0004007c, + 0x00000008,0x00000086,0x00000085,0x0003003e,0x00000082,0x00000086,0x0004003d,0x00000008, + 0x00000087,0x0000007d,0x0004003d,0x00000008,0x00000088,0x0000000a,0x00050084,0x00000008, + 0x00000089,0x00000018,0x00000088,0x00050082,0x00000008,0x0000008a,0x00000089,0x00000018, + 0x00060041,0x0000000f,0x0000008b,0x00000016,0x00000017,0x0000008a,0x0004003d,0x00000008, + 0x0000008c,0x0000008b,0x00050084,0x00000008,0x0000008d,0x00000087,0x0000008c,0x0004003d, + 0x00000008,0x0000008e,0x00000042,0x00050080,0x00000008,0x0000008f,0x0000008e,0x0000008d, + 0x0003003e,0x00000042,0x0000008f,0x0004003d,0x00000008,0x00000090,0x0000007d,0x0004003d, + 0x00000008,0x00000091,0x0000000a,0x00050084,0x00000008,0x00000092,0x0000001f,0x00000091, + 0x00050082,0x00000008,0x00000093,0x00000092,0x00000018,0x00060041,0x0000000f,0x00000094, + 0x00000016,0x00000017,0x00000093,0x0004003d,0x00000008,0x00000095,0x00000094,0x00050084, + 0x00000008,0x00000096,0x00000090,0x00000095,0x0004003d,0x00000008,0x00000097,0x00000043, + 0x00050080,0x00000008,0x00000098,0x00000097,0x00000096,0x0003003e,0x00000043,0x00000098, + 0x0004003d,0x00000008,0x00000099,0x0000007d,0x0004003d,0x00000008,0x0000009a,0x0000000a, + 0x00050082,0x00000008,0x0000009b,0x0000009a,0x00000018,0x00060041,0x0000000f,0x0000009c, + 0x00000016,0x00000017,0x0000009b,0x0004003d,0x00000008,0x0000009d,0x0000009c,0x00050084, + 0x00000008,0x0000009e,0x00000099,0x0000009d,0x0004003d,0x00000008,0x0000009f,0x00000044, + 0x00050080,0x00000008,0x000000a0,0x0000009f,0x0000009e,0x0003003e,0x00000044,0x000000a0, + 0x0004003d,0x00000008,0x000000a2,0x00000082,0x0003003e,0x000000a1,0x000000a2,0x000200f9, + 0x000000a3,0x000200f8,0x000000a3,0x000400f6,0x000000a5,0x000000a6,0x00000000,0x000200f9, + 0x000000a7,0x000200f8,0x000000a7,0x0004003d,0x00000008,0x000000a8,0x000000a1,0x0004003d, + 0x00000008,0x000000a9,0x0000002a,0x000500b1,0x00000050,0x000000aa,0x000000a8,0x000000a9, 
+ 0x000400fa,0x000000aa,0x000000a4,0x000000a5,0x000200f8,0x000000a4,0x00050041,0x0000000f, + 0x000000ab,0x0000000d,0x00000017,0x0004003d,0x00000008,0x000000ac,0x000000ab,0x000300f7, + 0x000000b1,0x00000000,0x000b00fb,0x000000ac,0x000000b1,0x00000012,0x000000ad,0x00000010, + 0x000000ae,0x0000000f,0x000000af,0x00000013,0x000000b0,0x000200f8,0x000000ad,0x0004003d, + 0x00000008,0x000000b7,0x00000044,0x0004003d,0x00000008,0x000000b8,0x000000a1,0x0004003d, + 0x00000008,0x000000b9,0x00000025,0x00050084,0x00000008,0x000000ba,0x000000b8,0x000000b9, + 0x00050080,0x00000008,0x000000bb,0x000000b7,0x000000ba,0x0004003d,0x00000008,0x000000c0, + 0x00000042,0x0004003d,0x00000008,0x000000c1,0x000000a1,0x0004003d,0x00000008,0x000000c2, + 0x00000012,0x00050084,0x00000008,0x000000c3,0x000000c1,0x000000c2,0x00050080,0x00000008, + 0x000000c4,0x000000c0,0x000000c3,0x00060041,0x000000c5,0x000000c6,0x000000bf,0x00000017, + 0x000000c4,0x0004003d,0x000000b2,0x000000c7,0x000000c6,0x0004003d,0x00000008,0x000000cc, + 0x00000043,0x0004003d,0x00000008,0x000000cd,0x000000a1,0x0004003d,0x00000008,0x000000ce, + 0x0000001e,0x00050084,0x00000008,0x000000cf,0x000000cd,0x000000ce,0x00050080,0x00000008, + 0x000000d0,0x000000cc,0x000000cf,0x00060041,0x000000c5,0x000000d1,0x000000cb,0x00000017, + 0x000000d0,0x0004003d,0x000000b2,0x000000d2,0x000000d1,0x00050081,0x000000b2,0x000000d3, + 0x000000c7,0x000000d2,0x00060041,0x000000c5,0x000000d4,0x000000b6,0x00000017,0x000000bb, + 0x0003003e,0x000000d4,0x000000d3,0x000200f9,0x000000b1,0x000200f8,0x000000ae,0x0004003d, + 0x00000008,0x000000d6,0x00000044,0x0004003d,0x00000008,0x000000d7,0x000000a1,0x0004003d, + 0x00000008,0x000000d8,0x00000025,0x00050084,0x00000008,0x000000d9,0x000000d7,0x000000d8, + 0x00050080,0x00000008,0x000000da,0x000000d6,0x000000d9,0x0004003d,0x00000008,0x000000db, + 0x00000042,0x0004003d,0x00000008,0x000000dc,0x000000a1,0x0004003d,0x00000008,0x000000dd, + 
0x00000012,0x00050084,0x00000008,0x000000de,0x000000dc,0x000000dd,0x00050080,0x00000008, + 0x000000df,0x000000db,0x000000de,0x00060041,0x000000c5,0x000000e0,0x000000bf,0x00000017, + 0x000000df,0x0004003d,0x000000b2,0x000000e1,0x000000e0,0x0004003d,0x00000008,0x000000e2, + 0x00000043,0x0004003d,0x00000008,0x000000e3,0x000000a1,0x0004003d,0x00000008,0x000000e4, + 0x0000001e,0x00050084,0x00000008,0x000000e5,0x000000e3,0x000000e4,0x00050080,0x00000008, + 0x000000e6,0x000000e2,0x000000e5,0x00060041,0x000000c5,0x000000e7,0x000000cb,0x00000017, + 0x000000e6,0x0004003d,0x000000b2,0x000000e8,0x000000e7,0x00050083,0x000000b2,0x000000e9, + 0x000000e1,0x000000e8,0x00060041,0x000000c5,0x000000ea,0x000000b6,0x00000017,0x000000da, + 0x0003003e,0x000000ea,0x000000e9,0x000200f9,0x000000b1,0x000200f8,0x000000af,0x0004003d, + 0x00000008,0x000000ec,0x00000044,0x0004003d,0x00000008,0x000000ed,0x000000a1,0x0004003d, + 0x00000008,0x000000ee,0x00000025,0x00050084,0x00000008,0x000000ef,0x000000ed,0x000000ee, + 0x00050080,0x00000008,0x000000f0,0x000000ec,0x000000ef,0x0004003d,0x00000008,0x000000f1, + 0x00000042,0x0004003d,0x00000008,0x000000f2,0x000000a1,0x0004003d,0x00000008,0x000000f3, + 0x00000012,0x00050084,0x00000008,0x000000f4,0x000000f2,0x000000f3,0x00050080,0x00000008, + 0x000000f5,0x000000f1,0x000000f4,0x00060041,0x000000c5,0x000000f6,0x000000bf,0x00000017, + 0x000000f5,0x0004003d,0x000000b2,0x000000f7,0x000000f6,0x0004003d,0x00000008,0x000000f8, + 0x00000043,0x0004003d,0x00000008,0x000000f9,0x000000a1,0x0004003d,0x00000008,0x000000fa, + 0x0000001e,0x00050084,0x00000008,0x000000fb,0x000000f9,0x000000fa,0x00050080,0x00000008, + 0x000000fc,0x000000f8,0x000000fb,0x00060041,0x000000c5,0x000000fd,0x000000cb,0x00000017, + 0x000000fc,0x0004003d,0x000000b2,0x000000fe,0x000000fd,0x00050085,0x000000b2,0x000000ff, + 0x000000f7,0x000000fe,0x00060041,0x000000c5,0x00000100,0x000000b6,0x00000017,0x000000f0, + 0x0003003e,0x00000100,0x000000ff,0x000200f9,0x000000b1,0x000200f8,0x000000b0,0x0004003d, 
+ 0x00000008,0x00000102,0x00000044,0x0004003d,0x00000008,0x00000103,0x000000a1,0x0004003d, + 0x00000008,0x00000104,0x00000025,0x00050084,0x00000008,0x00000105,0x00000103,0x00000104, + 0x00050080,0x00000008,0x00000106,0x00000102,0x00000105,0x0004003d,0x00000008,0x00000107, + 0x00000042,0x0004003d,0x00000008,0x00000108,0x000000a1,0x0004003d,0x00000008,0x00000109, + 0x00000012,0x00050084,0x00000008,0x0000010a,0x00000108,0x00000109,0x00050080,0x00000008, + 0x0000010b,0x00000107,0x0000010a,0x00060041,0x000000c5,0x0000010c,0x000000bf,0x00000017, + 0x0000010b,0x0004003d,0x000000b2,0x0000010d,0x0000010c,0x0004003d,0x00000008,0x0000010e, + 0x00000043,0x0004003d,0x00000008,0x0000010f,0x000000a1,0x0004003d,0x00000008,0x00000110, + 0x0000001e,0x00050084,0x00000008,0x00000111,0x0000010f,0x00000110,0x00050080,0x00000008, + 0x00000112,0x0000010e,0x00000111,0x00060041,0x000000c5,0x00000113,0x000000cb,0x00000017, + 0x00000112,0x0004003d,0x000000b2,0x00000114,0x00000113,0x00050088,0x000000b2,0x00000115, + 0x0000010d,0x00000114,0x00060041,0x000000c5,0x00000116,0x000000b6,0x00000017,0x00000106, + 0x0003003e,0x00000116,0x00000115,0x000200f9,0x000000b1,0x000200f8,0x000000b1,0x000200f9, + 0x000000a6,0x000200f8,0x000000a6,0x0004003d,0x00000008,0x0000011a,0x000000a1,0x00050080, + 0x00000008,0x0000011b,0x0000011a,0x00000119,0x0003003e,0x000000a1,0x0000011b,0x000200f9, + 0x000000a3,0x000200f8,0x000000a5,0x000100fd,0x00010038 +}; + +}}} // namespace cv::dnn::vkcom diff --git a/modules/dnn/src/vkcom/shader/spv_shader.cpp b/modules/dnn/src/vkcom/shader/spv_shader.cpp index 7f6b9d3ab4..42285e5f77 100644 --- a/modules/dnn/src/vkcom/shader/spv_shader.cpp +++ b/modules/dnn/src/vkcom/shader/spv_shader.cpp @@ -12,10 +12,11 @@ std::map > SPVMaps; void initSPVMaps() { SPVMaps.insert(std::make_pair("conv_1x1_fast_spv", std::make_pair(conv_1x1_fast_spv, 3134))); - SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902))); + SPVMaps.insert(std::make_pair("conv_depthwise_spv", 
std::make_pair(conv_depthwise_spv, 2092))); SPVMaps.insert(std::make_pair("conv_depthwise_3x3_spv", std::make_pair(conv_depthwise_3x3_spv, 1977))); SPVMaps.insert(std::make_pair("conv_implicit_gemm_spv", std::make_pair(conv_implicit_gemm_spv, 3565))); - SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092))); + SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902))); + SPVMaps.insert(std::make_pair("nary_eltwise_binary_forward_spv", std::make_pair(nary_eltwise_binary_forward_spv, 1757))); } }}} // namespace cv::dnn::vkcom diff --git a/modules/dnn/src/vkcom/shader/spv_shader.hpp b/modules/dnn/src/vkcom/shader/spv_shader.hpp index e90cf605c4..1573a92625 100644 --- a/modules/dnn/src/vkcom/shader/spv_shader.hpp +++ b/modules/dnn/src/vkcom/shader/spv_shader.hpp @@ -9,10 +9,11 @@ namespace cv { namespace dnn { namespace vkcom { extern const unsigned int conv_1x1_fast_spv[3134]; -extern const unsigned int gemm_spv[2902]; +extern const unsigned int conv_depthwise_spv[2092]; extern const unsigned int conv_depthwise_3x3_spv[1977]; extern const unsigned int conv_implicit_gemm_spv[3565]; -extern const unsigned int conv_depthwise_spv[2092]; +extern const unsigned int gemm_spv[2902]; +extern const unsigned int nary_eltwise_binary_forward_spv[1757]; extern std::map > SPVMaps; diff --git a/modules/dnn/src/vkcom/src/op_naryEltwise.cpp b/modules/dnn/src/vkcom/src/op_naryEltwise.cpp new file mode 100644 index 0000000000..812ca097b3 --- /dev/null +++ b/modules/dnn/src/vkcom/src/op_naryEltwise.cpp @@ -0,0 +1,197 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "../../precomp.hpp" +#include "internal.hpp" +#include "../include/op_naryeltwise.hpp" + +namespace cv { namespace dnn { namespace vkcom { + +#ifdef HAVE_VULKAN + +#define STEP_SIZE 65536 + +#define MAX_GROUP_COUNT_X 65535 +#define MAX_GROUP_COUNT_Y 65535 +#define MAX_GROUP_COUNT_Z 65535 + +OpNary::OpNary(const OpNary::OPERATION _naryOpType, int _ninputs, int _max_ndims, + const std::vector> shapes, const std::vector> steps) + : naryOpType(_naryOpType), ninputs(_ninputs), max_ndims(_max_ndims) +{ + CV_Assert(ninputs > 1); + + shapesBuf.resize((ninputs + 1) * max_ndims); + stepsBuf.resize((ninputs + 1) * max_ndims); + for (int i = 0; i <= ninputs; i++) + { + std::copy(shapes[i].begin(), shapes[i].end(), shapesBuf.data() + i * max_ndims); + std::copy(steps[i].begin(), steps[i].end(), stepsBuf.data() + i * max_ndims); + } + + // TODO(VK): support more types of operation + switch(naryOpType) { + // case OPERATION::EQUAL: + // case OPERATION::GREATER: + // case OPERATION::GREATER_EQUAL: + // case OPERATION::LESS: + // case OPERATION::LESS_EQUAL: + // case OPERATION::POW: + // case OPERATION::BITSHIFT: + // case OPERATION::MOD: + case OPERATION::PROD: + case OPERATION::SUB: + case OPERATION::ADD: + case OPERATION::DIV: + // case OPERATION::AND: + // case OPERATION::OR: + // case OPERATION::XOR: + { + CV_Assert(ninputs == 2); + CV_Assert(max_ndims >= 2); + shaderType = kNaryShaderTypeBinary; + shader_name = "nary_eltwise_binary_forward_spv"; + + // TODO(VK): confirm if this makes any sense + nplanes = std::accumulate(shapesBuf.data(), shapesBuf.data() + max_ndims - 2, 1, [](int32_t a, int32_t b) { return a * b; } ); + N2 = shapesBuf.data()[max_ndims - 2]; + N1 = shapesBuf.data()[max_ndims - 1]; + CV_LOG_DEBUG(NULL, "max_ndims="<= 2); + shaderType = kNaryShaderTypeNary; + shader_name = "nary_eltwise_nary_forward_spv"; + break; + } + //TODO(VK) add other cases + default: + CV_Error(Error::StsNotImplemented, "Unsupported nary operation type"); + } + // TODO(VK): 
initialize OpNary class +} + +void OpNary::firstForward() +{ + if (!firstForwardFinsh) + { + config.local_size_x = 1; // TODO(vk) determine local_size_y if necessary + config.local_size_y = 1; // TODO(vk) determine local_size_y if necessary + config.local_size_z = 1; // TODO(vk) determine local_size_z if necessary + computeGroupCount(); + firstForwardFinsh = true; + } + else + return; +} + +bool OpNary::binaryForward(std::vector& ins, std::vector& outs) +{ + std::vector param = {(int32_t)naryOpType, max_ndims}; + std::vector paramSize = {(int32_t)param.size()}; + std::vector dimSizes = {(ninputs + 1) * max_ndims}; + std::vector actualSteps; + + // TODO(VK): compute step for different dtype. Currently this is for kFormatFp32. + actualSteps.resize(stepsBuf.size()); + std::transform(stepsBuf.data(), stepsBuf.data() + dimSizes[0], actualSteps.begin(), [](int32_t sz){ return sz / 4; }); + + Tensor paramTensor = Tensor(reinterpret_cast(param.data()), paramSize, kFormatInt32, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + Tensor shapeTensor = Tensor(reinterpret_cast(shapesBuf.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT); + Tensor stepTensor = Tensor(reinterpret_cast(actualSteps.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT); + + destTypes = { + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input1 + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input2 + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // out + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // param + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // shape + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // step + }; + + + Ptr pipeline = pipelineFactoryPtr->getPipeline(shader_name, destTypes); + Ptr cmdBuffer = cmdPoolPtr->allocBuffer(); + Ptr desSet = pipeline->createSet(); + VkCommandBuffer cmdBufferReal = cmdBuffer->get(); + + desSet->writeTensor(ins[0], 0); + desSet->writeTensor(ins[1], 1); + desSet->writeTensor(outs[0], 2); + desSet->writeTensor(paramTensor, 3); + desSet->writeTensor(shapeTensor, 4); + 
desSet->writeTensor(stepTensor, 5); + + cmdBuffer->beginRecord(); + pipeline->bind(cmdBufferReal, desSet->get()); + vkCmdDispatch(cmdBufferReal, group_x_, group_y_, group_z_); + cmdBuffer->endRecord(); + cmdPoolPtr->submitAndWait(cmdBufferReal); + + return true; +} + +bool OpNary::forward(std::vector& ins, std::vector& outs) +{ + + firstForward(); + + // TODO(VK): Support more dtypes. Currently only kFormatFp32 is supported. + for (auto &tensor: ins) + { + CV_Assert(tensor.getFormat() == kFormatFp32); + } + for (auto &tensor: outs) + { + CV_Assert(tensor.getFormat() == kFormatFp32); + } + + switch(shaderType) { + case kNaryShaderTypeBinary: { + return binaryForward(ins, outs); + break; + } + default: + CV_Error(Error::StsNotImplemented, "Unsupported shader type invoked."); + } + + return true; +} + +bool OpNary::computeGroupCount() +{ + if (shaderType == kNaryShaderTypeBinary) + { + group_x_ = nplanes; // parallelism at plane level + group_y_ = N2; + group_z_ = 1; + } + else + { + CV_Error(CV_StsNotImplemented, "shader type is not supported at compute GroupCount."); + } + + CV_Assert(group_x_ <= MAX_GROUP_COUNT_X); + CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y); + CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z); + + return true; +} + +#endif // HAVE_VULKAN + +}}} // namespace cv::dnn::vkcom