From f53f491cd2700502dfbeebadff8b7492265e8b80 Mon Sep 17 00:00:00 2001
From: Yashas Samaga B L
Date: Sat, 1 Aug 2020 17:33:07 +0530
Subject: [PATCH] Merge pull request #17939 from YashasSamaga:cuda4dnn-fix-eltwise-fusion

* fix eltwise fusion segfault, more eltwise fusions, fix power fusion

* add assertion
---
 .../src/cuda4dnn/primitives/convolution.hpp  | 26 +++++++++----------
 modules/dnn/src/dnn.cpp                      | 22 +++++-----------
 modules/dnn/src/layers/convolution_layer.cpp | 14 +++++-----
 3 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
index 0129a7ed2a..8d788f05dc 100644
--- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
@@ -68,7 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             IDENTITY,
             RELU, /* uses value provided in `relu_negative_slope` */
             CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
-            POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */
+            POWER,
             TANH,
             SIGMOID,
             SWISH,
@@ -76,7 +76,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         };

         ActivationType activation_type;
-        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+        float relu_negative_slope, crelu_floor, crelu_ceil;
+        float power_exp, power_scale, power_shift;
     };

     template
@@ -224,10 +225,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             crelu_floor = config.crelu_floor;
             crelu_ceil = config.crelu_ceil;
             power_exp = config.power_exp;
-
-            /* the scale and shift parameters of POWER have already been fused with weights and bias */
-            if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
-                activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+            power_scale = config.power_scale;
+            power_shift = config.power_shift;

             /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
              * hence, the activation for cuDNN is IDENTITY by default
@@ -383,7 +382,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_eltwise_sum_2_clipped_relu_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+                    kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_eltwise_sum_2_tanh_inplace(stream, output, inner_size, biasTensor, eltwise);
@@ -414,7 +413,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_clipped_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0);
+                    kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_tanh_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise);
@@ -450,7 +449,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::biasN_clipped_relu_inplace(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0);
+                    kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::biasN_tanh_inplace(stream, output, inner_size, biasTensor);
@@ -497,7 +496,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::eltwise_sum_2_clipped_relu(stream, output, output, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, 1.0, 0.0);
+                    kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::eltwise_sum_2_tanh(stream, output, output, eltwise);
@@ -527,7 +526,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::clipped_relu_eltwise_sum_2_inplace(stream, output, eltwise, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, 1.0, 0.0);
+                    kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::tanh_eltwise_sum_2_inplace(stream, output, eltwise);
@@ -561,7 +560,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     kernels::clipped_relu(stream, output, output, crelu_floor, crelu_ceil);
                     break;
                 case ConvolutionConfiguration::ActivationType::POWER:
-                    kernels::power(stream, output, output, power_exp, 1.0, 0.0);
+                    kernels::power(stream, output, output, power_exp, power_scale, power_shift);
                     break;
                 case ConvolutionConfiguration::ActivationType::TANH:
                     kernels::tanh(stream, output, output);
@@ -595,7 +594,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         ConvolutionConfiguration::FusionMode fusion_mode;

         ConvolutionConfiguration::ActivationType activation;
-        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+        float relu_negative_slope, crelu_floor, crelu_ceil;
+        float power_exp, power_scale, power_shift;

         enum class InternalFusionLocation {
             CUDNN,
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index a04db892fc..3bff866bb8 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2656,32 +2656,21 @@ struct Net::Impl : public detail::NetImplBase
             Ptr nextEltwiseLayer;
             if( nextData )
                 nextEltwiseLayer = nextData->layerInstance.dynamicCast();
-
 #ifdef HAVE_CUDA
             // CUDA backend supports fusion with eltwise sum (without variable channels)
             // `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
             if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
             {
                 // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
-                cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp does not use the context during init
+                cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
                 const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                 const auto eltwiseNode = node.dynamicCast();
-                if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+                // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+                // Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
+                if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
                     nextEltwiseLayer = Ptr();
-
-                // check for variable channels
-                auto& inputs = nextData->inputBlobs;
-                for (int i = 1; i < inputs.size(); ++i)
-                {
-                    if (inputs[i]->size[1] != inputs[0]->size[1])
-                    {
-                        nextEltwiseLayer = Ptr();
-                        break;
-                    }
-                }
             }
 #endif
-
             if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
             {
                 LayerData *eltwiseData = nextData;
@@ -2725,7 +2714,8 @@ struct Net::Impl : public detail::NetImplBase
                     {
                         nextData = &layers[eltwiseData->consumers[0].lid];
                         lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-                        if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1)
+                        CV_Assert(nextData);
+                        if (nextData->outputBlobs.size() == 1)
                             nextFusabeleActivLayer = nextData->layerInstance.dynamicCast();
                     }
                     else
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 2e125b5e95..52d8971a01 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -250,7 +250,8 @@ public:
 #ifdef HAVE_CUDA
     cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
     cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
-    float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+    float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil;
+    float cuda_power_exp, cuda_power_scale, cuda_power_shift;
 #endif

     ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
@@ -457,13 +458,8 @@ public:
             Ptr activ_power = activ.dynamicCast();
             if (!activ_power.empty())
             {
-                if (activ_power->scale != 1.f || activ_power->shift != 0.f)
-                {
-                    const int outCh = blobs[0].size[0];
-                    fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
-                                Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
-                }
-
+                cuda_power_scale = activ_power->scale;
+                cuda_power_shift = activ_power->shift;
                 cuda_power_exp = activ_power->power;
                 cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
             }
@@ -1591,6 +1587,8 @@ public:
         config.crelu_floor = cuda_crelu_floor;
         config.crelu_ceil = cuda_crelu_ceil;
         config.power_exp = cuda_power_exp;
+        config.power_scale = cuda_power_scale;
+        config.power_shift = cuda_power_shift;

         Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
         Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
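
Note on the POWER fusion semantics (illustration only, not part of the patch): OpenCV's Power layer computes f(x) = (shift + scale * x) ^ power per element, which is why the previously hard-coded `1.0, 0.0` scale/shift arguments in the CUDA kernel calls were only correct for the default parameters. The following is a minimal standalone sketch of the expected per-element computation; `power_activation` is a hypothetical helper introduced here for illustration, not a function from the patch.

// Sketch of the per-element POWER activation, assuming OpenCV's Power layer
// definition f(x) = (shift + scale * x) ^ power.
#include <cmath>
#include <cstdio>

static float power_activation(float x, float power, float scale, float shift)
{
    return std::pow(shift + scale * x, power);
}

int main()
{
    // With scale = 1 and shift = 0 (the values that were hard-coded before this patch)
    // the result is plain pow(x, power); any other scale/shift needs the new parameters.
    std::printf("%f\n", power_activation(2.0f, 2.0f, 1.0f, 0.0f)); // 4.000000
    std::printf("%f\n", power_activation(2.0f, 2.0f, 3.0f, 1.0f)); // (1 + 3*2)^2 = 49.000000
    return 0;
}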