From f53f491cd2700502dfbeebadff8b7492265e8b80 Mon Sep 17 00:00:00 2001
From: Yashas Samaga B L
Date: Sat, 1 Aug 2020 17:33:07 +0530
Subject: [PATCH] Merge pull request #17939 from YashasSamaga:cuda4dnn-fix-eltwise-fusion

* fix eltwise fusion segfault, more eltwise fusions, fix power fusion

* add assertion
---
 .../src/cuda4dnn/primitives/convolution.hpp  | 26 +++++++++----------
 modules/dnn/src/dnn.cpp                      | 22 +++++-----------
 modules/dnn/src/layers/convolution_layer.cpp | 14 +++++-----
 3 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
index 0129a7ed2a..8d788f05dc 100644
--- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
@@ -68,7 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             IDENTITY,
             RELU, /* uses value provided in `relu_negative_slope` */
             CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
-            POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */
+            POWER,
             TANH,
             SIGMOID,
             SWISH,
@@ -76,7 +76,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         };

         ActivationType activation_type;
-        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+        float relu_negative_slope, crelu_floor, crelu_ceil;
+        float power_exp, power_scale, power_shift;
     };

     template
@@ -224,10 +225,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             crelu_floor = config.crelu_floor;
             crelu_ceil = config.crelu_ceil;
             power_exp = config.power_exp;
-
-            /* the scale and shift parameters of POWER have already been fused with weights and bias */
-            if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
-                activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+            power_scale = config.power_scale;
+            power_shift = config.power_shift;

             /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
              * hence, the activation for cuDNN is IDENTITY by default
@@ -383,7 +382,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_eltwise_sum_2_clipped_relu_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+                    kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_eltwise_sum_2_tanh_inplace(stream, output, inner_size, biasTensor, eltwise);
@@ -414,7 +413,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_clipped_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+                    kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_tanh_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise);
@@ -450,7 +449,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_clipped_relu_inplace(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0);
+                    kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_tanh_inplace(stream, output, inner_size, biasTensor);
@@ -497,7 +496,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::eltwise_sum_2_clipped_relu(stream, output, output, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, 1.0, 0.0);
+                    kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::eltwise_sum_2_tanh(stream, output, output, eltwise);
@@ -527,7 +526,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::clipped_relu_eltwise_sum_2_inplace(stream, output, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, 1.0, 0.0);
+                    kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::tanh_eltwise_sum_2_inplace(stream, output, eltwise);
@@ -561,7 +560,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::clipped_relu(stream, output, output, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::power(stream, output, output, power_exp, 1.0, 0.0);
+                    kernels::power(stream, output, output, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::tanh(stream, output, output);
@@ -595,7 +594,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         ConvolutionConfiguration::FusionMode fusion_mode;

         ConvolutionConfiguration::ActivationType activation;
-        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+        float relu_negative_slope, crelu_floor, crelu_ceil;
+        float power_exp, power_scale, power_shift;

         enum class InternalFusionLocation {
             CUDNN,
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index a04db892fc..3bff866bb8 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2656,32 +2656,21 @@ struct Net::Impl : public detail::NetImplBase
             Ptr nextEltwiseLayer;
             if( nextData )
                 nextEltwiseLayer = nextData->layerInstance.dynamicCast();
-
 #ifdef HAVE_CUDA
             // CUDA backend supports fusion with eltwise sum (without variable channels)
             // `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
             if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
             {
                 // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-                cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
+                cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
                 const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                 const auto eltwiseNode = node.dynamicCast();
-                if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+                // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+                // Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
+                if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
                     nextEltwiseLayer = Ptr();
-
-                // check for variable channels
-                auto& inputs = nextData->inputBlobs;
-                for (int i = 1; i < inputs.size(); ++i)
-                {
-                    if (inputs[i]->size[1] != inputs[0]->size[1])
-                    {
-                        nextEltwiseLayer = Ptr();
-                        break;
-                    }
-                }
             }
 #endif
-
             if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
             {
                 LayerData *eltwiseData = nextData;
@@ -2725,7 +2714,8 @@ struct Net::Impl : public detail::NetImplBase
                     {
                         nextData = &layers[eltwiseData->consumers[0].lid];
                         lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-                        if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1)
+                        CV_Assert(nextData);
+                        if (nextData->outputBlobs.size() == 1)
                             nextFusabeleActivLayer = nextData->layerInstance.dynamicCast();
                     }
                     else
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 2e125b5e95..52d8971a01 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -250,7 +250,8 @@ public:
 #ifdef HAVE_CUDA
     cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
     cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
-    float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+    float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil;
+    float cuda_power_exp, cuda_power_scale, cuda_power_shift;
 #endif

     ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
@@ -457,13 +458,8 @@ public:
             Ptr activ_power = activ.dynamicCast();
             if (!activ_power.empty())
             {
-                if (activ_power->scale != 1.f || activ_power->shift != 0.f)
-                {
-                    const int outCh = blobs[0].size[0];
-                    fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
-                                Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
-                }
-
+                cuda_power_scale = activ_power->scale;
+                cuda_power_shift = activ_power->shift;
                 cuda_power_exp = activ_power->power;
                 cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
             }
@@ -1591,6 +1587,8 @@ public:
         config.crelu_floor = cuda_crelu_floor;
         config.crelu_ceil = cuda_crelu_ceil;
         config.power_exp = cuda_power_exp;
+        config.power_scale = cuda_power_scale;
+        config.power_shift = cuda_power_shift;

         Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
         Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
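
Note on the POWER fusion semantics (illustration only, not part of the patch): OpenCV's Power layer computes f(x) = (shift + scale * x) ^ power per element, which is why the previously hard-coded `1.0, 0.0` scale/shift arguments in the CUDA kernel calls were only correct for the default parameters. The following is a minimal standalone sketch of the expected per-element computation; `power_activation` is a hypothetical helper introduced here for illustration, not a function from the patch.

// Sketch of the per-element POWER activation, assuming OpenCV's Power layer
// definition f(x) = (shift + scale * x) ^ power.
#include <cmath>
#include <cstdio>

static float power_activation(float x, float power, float scale, float shift)
{
    return std::pow(shift + scale * x, power);
}

int main()
{
    // With scale = 1 and shift = 0 (the values that were hard-coded before this patch)
    // the result is plain pow(x, power); any other scale/shift needs the new parameters.
    std::printf("%f\n", power_activation(2.0f, 2.0f, 1.0f, 0.0f)); // 4.000000
    std::printf("%f\n", power_activation(2.0f, 2.0f, 3.0f, 1.0f)); // (1 + 3*2)^2 = 49.000000
    return 0;
}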