Open Source Computer Vision Library https://opencv.org/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

378 lines
16 KiB

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "cpu_kernels/fast_norm.hpp"
// CANN backend
#include "../op_cann.hpp"
// OpenVINO backend
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
// CUDA backend
#include "../op_cuda.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/layer_norm.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
// OpenCL backend
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/math_functions.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
namespace cv { namespace dnn {
// https://github.com/onnx/onnx/blob/main/docs/Operators.md#LayerNormalization
class LayerNormLayerImpl CV_FINAL : public LayerNormLayer
{
#ifdef HAVE_OPENCL
UMat weight_umat, bias_umat;
#endif
public:
LayerNormLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
// standard attr
axis = params.get<int>("axis", -1);
epsilon = params.get<float>("epsilon", 1e-5);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return true;
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_CANN && axis != -1); // axis=-1 not supported due to 1d mat shape problem
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
// check shapes of weight and bias if existed
// inputs >= 2 (X and Weight are required, bias is optional)
int num_inputs = inputs.size() + blobs.size();
CV_Check(num_inputs, num_inputs >= 2 && num_inputs <= 3, "LayerNorm: require two (x, weight) or three (x, weight, bias) inputs");
auto x_shape = inputs[0];
int x_ndims = static_cast<int>(x_shape.size());
// Weight and bias are either constants or variable
auto w_shape = blobs.empty() ? inputs[1] : shape(blobs.front());
// if axis == last_dim, scale and b are both 1d tensor (represented as 2d mat nx1)
int w_ndims = static_cast<int>(w_shape.size());
w_ndims = (axis == x_ndims - 1 && w_ndims == 2) ? w_ndims - 1 : w_ndims;
CV_CheckEQ(x_ndims - axis, w_ndims, "LayerNorm: shape of weight does not match with given axis and shape of input");
for (int i = 0; i < w_ndims; ++i)
CV_CheckEQ(x_shape[axis+i], w_shape[i], "LayerNorm: weight dimensions does not match with input dimensions");
if (num_inputs >= 3)
{
auto b_shape = blobs.empty() ? inputs[2] : shape(blobs.back());
CV_CheckEQ(w_shape.size(), b_shape.size(), "LayerNorm: shape of weight does not match with shape of bias");
for (size_t i = 0; i < w_shape.size(); ++i)
CV_CheckEQ(w_shape[i], b_shape[i], "LayerNorm: bias dimensions does not match with weight dimensions");
}
outputs.assign(1, inputs[0]);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
const auto input_shape = shape(inputs[0]);
axis = normalize_axis(axis, static_cast<int>(input_shape.size()));
#ifdef HAVE_OPENCL
weight_umat.release();
bias_umat.release();
#endif
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16F)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
const auto &input = inputs[0];
const auto &scale = blobs.empty() ? inputs[1] : blobs.front();
auto &output = outputs[0];
if ((inputs.size() + blobs.size()) >= 3) {
const auto &bias = blobs.empty() ? inputs[2] : blobs.back();
fastNorm(input, scale, bias, output, epsilon, static_cast<size_t>(axis));
} else {
fastNorm(input, scale, output, epsilon, static_cast<size_t>(axis));
}
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) {
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
const auto &input = inputs[0];
// no fp16 support
if (input.depth() == CV_16F) {
return false;
}
auto &output = outputs[0];
const auto input_shape = shape(input);
size_t loops = static_cast<size_t>(total(input_shape, 0, axis)),
norm_size = static_cast<size_t>(total(input_shape, axis));
float inv_norm_size = 1.f / norm_size;
if (weight_umat.empty()) {
if (blobs.empty()) {
weight_umat = inputs[1];
} else {
blobs.front().copyTo(weight_umat);
}
}
if (bias_umat.empty()) {
if ((inputs.size() + blobs.size()) == 3) {
if (blobs.empty()) {
bias_umat = inputs[2];
} else {
blobs.back().copyTo(bias_umat);
}
} else {
bias_umat = UMat::zeros(norm_size, 1, CV_32F);
}
}
String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
// Calculate mean
UMat one = UMat::ones(norm_size, 1, CV_32F);
UMat mean = UMat(loops, 1, CV_32F);
UMat mean_square = UMat(loops, 1, CV_32F);
UMat tmp = UMat(loops, norm_size, CV_32F);
bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
input, 0, one, 0, 0.f, mean, 0);
if (!ret) {
return false;
}
// Calculate mean_square
int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1);
size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)};
String build_opt = format(" -DNUM=%d", num_vector) + base_opts;
String mean_square_kernel_name = format("calc_mean%d", num_vector);
ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN");
if (mean_square_kernel.empty()) {
return false;
}
mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
mean_square_kernel.set(1, (int)loops);
mean_square_kernel.set(2, (int)norm_size);
mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean));
mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp));
ret = mean_square_kernel.run(2, global, NULL, false);
if (!ret) {
return false;
}
ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
tmp, 0, one, 0, 0.f, mean_square, 0);
if (!ret) {
return false;
}
// Calculate instance norm: output = weight * (x - mean) / sqrt(var + eps) + bias
String mvn_kernel_name = format("mvn%d", num_vector);
build_opt += " -DNORM_VARIANCE -DLAYER_NORM -DKERNEL_MVN";
ocl::Kernel mvn_kernel(mvn_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
if (mvn_kernel.empty()) {
return false;
}
mvn_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
mvn_kernel.set(1, (int)loops);
mvn_kernel.set(2, (int)norm_size);
mvn_kernel.set(3, (float)epsilon);
mvn_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
mvn_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
mvn_kernel.set(6, ocl::KernelArg::PtrReadOnly(weight_umat));
mvn_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias_umat));
mvn_kernel.set(8, (int)1);
mvn_kernel.set(9, (float)0.f);
mvn_kernel.set(10, ocl::KernelArg::PtrWriteOnly(output));
ret = mvn_kernel.run(2, global, NULL, false);
if (!ret) {
return false;
}
return true;
}
#endif
#ifdef HAVE_CANN
virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendWrapper> > &outputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
CV_CheckEQ(inputs.size(), static_cast<size_t>(3), "LayerNorm/CANN: requires three input wrappers");
CV_CheckEQ(nodes.size(), static_cast<size_t>(3), "LayerNorm/CANN: requires three input nodes");
auto input_tensor_wrapper = inputs[0].dynamicCast<CannBackendWrapper>();
auto input_tensor_desc = input_tensor_wrapper->getTensorDesc();
CV_CheckNE(axis, static_cast<int>(input_tensor_desc->GetShape().GetDimNum() - 1), "LayerNorm: CANN does not support axis set as last axis due to 1D mat compatibility issue");
auto last_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
auto op = std::make_shared<ge::op::LayerNorm>(name);
// set attrs
op->set_attr_begin_norm_axis(axis);
op->set_attr_begin_params_axis(axis);
op->set_attr_epsilon(epsilon);
// set inputs
// set inputs : x
op->set_input_x_by_name(*last_node, input_tensor_wrapper->name.c_str());
op->update_input_desc_x(*input_tensor_desc);
// set inputs : gamma & beta
if (blobs.empty()) {
auto scale_tensor_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
auto scale_tensor_desc = scale_tensor_wrapper->getTensorDesc();
auto scale_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
op->set_input_gamma_by_name(*scale_node, scale_tensor_wrapper->name.c_str());
op->update_input_desc_gamma(*scale_tensor_desc);
if (inputs.size() == 3) {
auto bias_tensor_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
auto bias_tensor_desc = bias_tensor_wrapper->getTensorDesc();
auto bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
op->set_input_beta_by_name(*bias_node, bias_tensor_wrapper->name.c_str());
op->update_input_desc_beta(*bias_tensor_desc);
}
} else {
const auto &scale_mat = blobs.front();
const auto op_const_scale = std::make_shared<CannConstOp>(scale_mat.data, scale_mat.type(), shape(scale_mat), cv::format("%s_w", name.c_str()));
op->set_input_gamma(*(op_const_scale->getOp()));
op->update_input_desc_gamma(*(op_const_scale->getTensorDesc()));
if ((inputs.size() + blobs.size()) >= 3) {
const auto &bias_mat = blobs.back();
const auto op_const_bias = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), shape(bias_mat), cv::format("%s_b", name.c_str()));
op->set_input_beta(*(op_const_bias->getOp()));
op->update_input_desc_beta(*(op_const_bias->getTensorDesc()));
}
}
// set outputs
auto output_desc_y = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
op->update_output_desc_y(*output_desc_y);
auto output_desc_mean = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
op->update_output_desc_mean(*output_desc_mean);
auto output_desc_var = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
op->update_output_desc_variance(*output_desc_var);
return Ptr<BackendNode>(new CannBackendNode(op));
}
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
const auto &input_shape = ieInpNode.get_shape();
std::shared_ptr<ov::Node> mvn, result;
ov::Output<ov::Node> scale, bias;
// mvn
// https://docs.openvino.ai/2023.1/openvino_docs_ops_normalization_MVN_6.html
std::vector<int64_t> axes_v(input_shape.size() - axis);
std::iota(axes_v.begin(), axes_v.end(), axis);
auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes_v.size()}, axes_v.data());
bool normalize_variance = true;
mvn = std::make_shared<ov::op::v6::MVN>(ieInpNode, axes, normalize_variance, epsilon, ov::op::MVNEpsMode::INSIDE_SQRT);
// layer norm = scale * mvn + bias
if (blobs.empty()) {
scale = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
if (nodes.size() == 3) {
bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
}
} else {
auto scale_mat = blobs.front();
const auto scale_shape = shape(scale_mat);
scale = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(scale_shape.begin(), scale_shape.end()), scale_mat.data);
if ((nodes.size() + blobs.size()) == 3) {
auto bias_mat = blobs.back();
const auto bias_shape = shape(bias_mat);
bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), bias_mat.data);
}
}
if (axis == -1 || axis == input_shape.size() - 1) { // special case for 1D tensor (2D mat)
std::vector<int64_t> shared_shape_v(input_shape.size(), 1);
shared_shape_v.back() = -1;
auto shared_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{shared_shape_v.size()}, shared_shape_v.data());
scale = std::make_shared<ov::op::v1::Reshape>(scale, shared_shape, true);
if ((nodes.size() + blobs.size()) == 3) {
bias = std::make_shared<ov::op::v1::Reshape>(bias, shared_shape, true);
}
}
result = std::make_shared<ov::op::v1::Multiply>(mvn, scale);
if ((nodes.size() + blobs.size()) == 3) {
result = std::make_shared<ov::op::v1::Add>(result, bias);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(result));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs) override {
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto input_shape = input_wrapper->getShape();
size_t loops = static_cast<size_t>(total(input_shape, 0, axis));
const auto scale = blobs.empty() ? Mat() : blobs.front(),
bias = blobs.empty() ? Mat() : blobs.back();
return make_cuda_node<cuda4dnn::LayerNormOp>(preferableTarget, std::move(context->stream), scale, bias, axis, epsilon, loops);
}
#endif // HAVE_CUDA
};
Ptr<LayerNormLayer> LayerNormLayer::create(const LayerParams& params)
{
return makePtr<LayerNormLayerImpl>(params);
}
}} // cv::dnn