From 71a22e45b0448aa9dc496012ac43326151a6d40b Mon Sep 17 00:00:00 2001
From: Smirnov Egor
Date: Fri, 3 Dec 2021 23:17:07 +0300
Subject: [PATCH] add celu, hardsigmoid, selu, thresholdedrelu layers

---
 .../dnn/include/opencv2/dnn/all_layers.hpp    |  34 +++
 modules/dnn/src/cuda/activations.cu           |  28 ++
 modules/dnn/src/cuda/functors.hpp             |  78 +++++
 .../dnn/src/cuda4dnn/kernels/activations.hpp  |  12 +
 .../src/cuda4dnn/primitives/activation.hpp    |  62 ++++
 modules/dnn/src/init.cpp                      |   4 +
 modules/dnn/src/layers/elementwise_layers.cpp | 288 +++++++++++++-----
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    |  65 ++++
 modules/dnn/src/opencl/activations.cl         |  34 +++
 ..._conformance_layer_parser_denylist.inl.hpp |  10 -
 10 files changed, 526 insertions(+), 89 deletions(-)

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 26d7a9b069..44b16f7800 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -738,6 +738,40 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<TanLayer> create(const LayerParams &params);
     };

+    class CV_EXPORTS CeluLayer : public ActivationLayer
+    {
+    public:
+        float alpha;
+
+        static Ptr<CeluLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS HardSigmoidLayer : public ActivationLayer
+    {
+    public:
+        float alpha;
+        float beta;
+
+        static Ptr<HardSigmoidLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS SeluLayer : public ActivationLayer
+    {
+    public:
+        float alpha;
+        float gamma;
+
+        static Ptr<SeluLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS ThresholdedReluLayer : public ActivationLayer
+    {
+    public:
+        float alpha;
+
+        static Ptr<ThresholdedReluLayer> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer
     {
     public:
diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu
index 3d99a03ae3..f5dafcea7f 100644
--- a/modules/dnn/src/cuda/activations.cu
+++ b/modules/dnn/src/cuda/activations.cu
@@ -233,6 +233,26 @@ void tan(const Stream& stream, Span<T> output, View<T> input) {
     generic_op<T, TanFunctor<T>>(stream, output, input);
 }

+template <class T>
+void celu(const Stream& stream, Span<T> output, View<T> input, T alpha) {
+    generic_op<T, CeluFunctor<T>>(stream, output, input, {alpha});
+}
+
+template <class T>
+void hardsigmoid(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
+    generic_op<T, HardSigmoidFunctor<T>>(stream, output, input, {alpha, beta});
+}
+
+template <class T>
+void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma) {
+    generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
+}
+
+template <class T>
+void thresholdedrelu(const Stream& stream, Span<T> output, View<T> input, T alpha) {
+    generic_op<T, ThresholdedReluFunctor<T>>(stream, output, input, {alpha});
+}
+
 template <class T>
 void abs(const Stream& stream, Span<T> output, View<T> input) {
     generic_op<T, AbsFunctor<T>>(stream, output, input);
 }
@@ -286,6 +306,10 @@ template void sinh<__half>(const Stream&, Span<__half>, View<__half>);
 template void softplus<__half>(const Stream&, Span<__half>, View<__half>);
 template void softsign<__half>(const Stream&, Span<__half>, View<__half>);
 template void tan<__half>(const Stream&, Span<__half>, View<__half>);
+template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
+template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
 template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
 template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
 #endif
@@ -321,6 +345,10 @@ template void sinh<float>(const Stream&, Span<float>, View<float>);
 template void softplus<float>(const Stream&, Span<float>, View<float>);
 template void softsign<float>(const Stream&, Span<float>, View<float>);
 template void tan<float>(const Stream&, Span<float>, View<float>);
+template void celu<float>(const Stream&, Span<float>, View<float>, float);
+template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
+template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
+template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
 template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
 template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp
index c3d1669344..640c7c8ad6 100644
--- a/modules/dnn/src/cuda/functors.hpp
+++ b/modules/dnn/src/cuda/functors.hpp
@@ -528,6 +528,84 @@ struct TanFunctor {
     }
 };

+template <class T>
+struct CeluFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() : alpha(1) { }
+        CUDA4DNN_HOST_DEVICE Params(T alpha_) : alpha(alpha_) { }
+        T alpha;
+    };
+
+    CUDA4DNN_DEVICE CeluFunctor() : CeluFunctor(Params{}) { }
+    CUDA4DNN_DEVICE CeluFunctor(const Params& params) : alpha{params.alpha} { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        using csl::device::min;
+        using csl::device::max;
+        using csl::device::expm1;
+        return max(T(0), value) + min(T(0), alpha * expm1(value / alpha));
+    }
+
+    T alpha;
+};
+
+template <class T>
+struct HardSigmoidFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() : alpha(0.2), beta(0.5) { }
+        CUDA4DNN_HOST_DEVICE Params(T alpha_, T beta_) : alpha(alpha_), beta(beta_) { }
+        T alpha, beta;
+    };
+
+    CUDA4DNN_DEVICE HardSigmoidFunctor() : HardSigmoidFunctor(Params{}) { }
+    CUDA4DNN_DEVICE HardSigmoidFunctor(const Params& params): alpha{params.alpha}, beta{params.beta} { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        using csl::device::clamp;
+        return clamp(alpha * value + beta, T(0), T(1));
+    }
+
+    T alpha, beta;
+};
+
+template <class T>
+struct SeluFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() : alpha(1.6732632423543772848170429916717),
+                                        gamma(1.0507009873554804934193349852946) { }
+        CUDA4DNN_HOST_DEVICE Params(T alpha_, T gamma_) : alpha(alpha_), gamma(gamma_) { }
+        T alpha, gamma;
+    };
+
+    CUDA4DNN_DEVICE SeluFunctor() : SeluFunctor(Params{}) { }
+    CUDA4DNN_DEVICE SeluFunctor(const Params& params): alpha{params.alpha}, gamma{params.gamma} { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        using csl::device::expm1;
+        return gamma * (value > T(0) ? value : alpha * expm1(value));
+    }
+
+    T alpha, gamma;
+};
+
+template <class T>
+struct ThresholdedReluFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() : alpha(1) { }
+        CUDA4DNN_HOST_DEVICE Params(T alpha_) : alpha(alpha_) { }
+        T alpha;
+    };
+
+    CUDA4DNN_DEVICE ThresholdedReluFunctor() : ThresholdedReluFunctor(Params{}) { }
+    CUDA4DNN_DEVICE ThresholdedReluFunctor(const Params& params) : alpha{params.alpha} { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        return (value > alpha) ? value : T(0);
+    }
+
+    T alpha;
+};
+
 template <class T>
 struct PowerFunctor {
     struct Params {
diff --git a/modules/dnn/src/cuda4dnn/kernels/activations.hpp b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
index 854bc8ac0c..ef1f6da3e6 100644
--- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
@@ -105,6 +105,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void tan(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

+    template <class T>
+    void celu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);
+
+    template <class T>
+    void hardsigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);
+
+    template <class T>
+    void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);
+
+    template <class T>
+    void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);
+
     template <class T>
     void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift);
diff --git a/modules/dnn/src/cuda4dnn/primitives/activation.hpp b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
index 4691996d4e..39ebf513a7 100644
--- a/modules/dnn/src/cuda4dnn/primitives/activation.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
@@ -490,6 +490,68 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         csl::Stream stream;
     };

+    template <class T>
+    class CeluOp final : public BaseOp<CeluOp, T> {
+    public:
+        CeluOp(csl::Stream stream_, T alpha_) : stream(std::move(stream_)), alpha{ alpha_ } { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::celu<T>(stream, output, input, alpha);
+        }
+
+    private:
+        csl::Stream stream;
+        const T alpha;
+    };
+
+    template <class T>
+    class HardSigmoidOp final : public BaseOp<HardSigmoidOp, T> {
+    public:
+        HardSigmoidOp(csl::Stream stream_, T alpha_, T beta_)
+            : stream(std::move(stream_)), alpha{ alpha_ }, beta{ beta_ } { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::hardsigmoid<T>(stream, output, input, alpha, beta);
+        }
+
+    private:
+        csl::Stream stream;
+        const T alpha, beta;
+    };
+
+    template <class T>
+    class SeluOp final : public BaseOp<SeluOp, T> {
+    public:
+        SeluOp(csl::Stream stream_, T alpha_, T gamma_)
+            : stream(std::move(stream_)), alpha{ alpha_ }, gamma{ gamma_ } { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::selu<T>(stream, output, input, alpha, gamma);
+        }
+
+    private:
+        csl::Stream stream;
+        const T alpha, gamma;
+    };
+
+    template <class T>
+    class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
+    public:
+        ThresholdedReluOp(csl::Stream stream_, T alpha_) : stream(std::move(stream_)), alpha{ alpha_ } { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::thresholdedrelu<T>(stream, output, input, alpha);
+        }
+
+    private:
+        csl::Stream stream;
+        const T alpha;
+    };
+
     template <class T>
     class PowerOp final : public BaseOp<PowerOp, T> {
     public:
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 89a91e17ae..55ed1e5d17 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -132,6 +132,10 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(Softplus, SoftplusLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Softsign, SoftsignLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Tan, TanLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Celu, CeluLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(HardSigmoid, HardSigmoidLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Selu, SeluLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ThresholdedRelu, ThresholdedReluLayer);
     CV_DNN_REGISTER_LAYER_CLASS(BatchNorm, BatchNormLayer);
     CV_DNN_REGISTER_LAYER_CLASS(MaxUnpool, MaxUnpoolLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Dropout, BlankLayer);
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 772dfca602..bfabef9d68 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -71,6 +71,7 @@ namespace dnn
 using std::abs;
 using std::exp;
+using std::expm1;
 using std::tanh;
 using std::pow;
 using std::ceil;
@@ -728,6 +729,20 @@ struct BaseDefaultFunctor : public BaseFunctor
         return true;
     }

+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        CV_Error(Error::StsNotImplemented, "");
+    }
+#endif
+
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        CV_Error(Error::StsNotImplemented, "");
+    }
+#endif // HAVE_HALIDE
+
 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
     InferenceEngine::Builder::Layer initInfEngineBuilderAPI()
     {
@@ -746,8 +761,6 @@ struct BaseDefaultFunctor : public BaseFunctor
     ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
     {
         CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
     }
 #endif

@@ -759,20 +772,6 @@ struct BaseDefaultFunctor : public BaseFunctor
     }
 #endif // HAVE_VULKAN

-#ifdef HAVE_CUDA
-    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-    }
-#endif
-
-#ifdef HAVE_HALIDE
-    void attachHalide(const Halide::Expr& input, Halide::Func& top)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-    }
-#endif // HAVE_HALIDE
-
 private:
     static const char* const ocl_kernel_name;
 };
@@ -823,15 +822,6 @@ struct TanHFunctor : public BaseDefaultFunctor<TanHFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 1; }
 };
@@ -935,15 +925,6 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 3; }
 };
@@ -996,15 +977,6 @@ struct SigmoidFunctor : public BaseDefaultFunctor<SigmoidFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 3; }
 };
@@ -1123,15 +1095,6 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 1; }
 };
@@ -1261,15 +1224,6 @@ struct LogFunctor : public BaseDefaultFunctor<LogFunctor>
         return log(x);
     }

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
 #ifdef HAVE_CUDA
     Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
     {
@@ -1367,15 +1321,6 @@ struct SqrtFunctor : public BaseDefaultFunctor<SqrtFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 1; }
 };
@@ -1822,6 +1767,156 @@ struct TanFunctor : public BaseDefaultFunctor<TanFunctor>
 template<>
 const char* const BaseDefaultFunctor<TanFunctor>::ocl_kernel_name = "TanForward";

+struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
+{
+    typedef CeluLayer Layer;
+
+    float alpha;
+
+    explicit CeluFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    }
+
+    inline float calculate(float x) const
+    {
+        return max(0.f, x) + min(0.f, alpha * expm1(x / alpha));
+    }
+
+    inline void setKernelParams(ocl::Kernel& kernel) const
+    {
+        kernel.set(3, alpha);
+    }
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::CeluOp>(target, stream, alpha);
+    }
+#endif
+
+    int64 getFLOPSPerElement() const { return 1; }
+};
+
+template<>
+const char* const BaseDefaultFunctor<CeluFunctor>::ocl_kernel_name = "CeluForward";
+
+struct HardSigmoidFunctor : public BaseDefaultFunctor<HardSigmoidFunctor>
+{
+    typedef HardSigmoidLayer Layer;
+
+    float alpha;
+    float beta;
+
+    explicit HardSigmoidFunctor(float alpha_ = 0.2f, float beta_ = 0.5f) : alpha(alpha_), beta(beta_) {}
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    }
+
+    inline float calculate(float x) const
+    {
+        return max(0.f, min(1.f, alpha * x + beta));
+    }
+
+    inline void setKernelParams(ocl::Kernel& kernel) const
+    {
+        kernel.set(3, alpha);
+        kernel.set(4, beta);
+    }
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::HardSigmoidOp>(target, stream, alpha, beta);
+    }
+#endif
+
+    int64 getFLOPSPerElement() const { return 1; }
+};
+
+template<>
+const char* const BaseDefaultFunctor<HardSigmoidFunctor>::ocl_kernel_name = "HardSigmoidForward";
+
+struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
+{
+    typedef SeluLayer Layer;
+
+    float alpha;
+    float gamma;
+
+    explicit SeluFunctor(float alpha_ = 1.67326319217681884765625f,
+                         float gamma_ = 1.05070102214813232421875f) : alpha(alpha_), gamma(gamma_) {}
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    }
+
+    inline float calculate(float x) const
+    {
+        return gamma * (x > 0.f ? x : alpha * expm1(x));
+    }
+
+    inline void setKernelParams(ocl::Kernel& kernel) const
+    {
+        kernel.set(3, alpha);
+        kernel.set(4, gamma);
+    }
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::SeluOp>(target, stream, alpha, gamma);
+    }
+#endif
+
+    int64 getFLOPSPerElement() const { return 1; }
+};
+
+template<>
+const char* const BaseDefaultFunctor<SeluFunctor>::ocl_kernel_name = "SeluForward";
+
+struct ThresholdedReluFunctor : public BaseDefaultFunctor<ThresholdedReluFunctor>
+{
+    typedef ThresholdedReluLayer Layer;
+
+    float alpha;
+
+    explicit ThresholdedReluFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+    }
+
+    inline float calculate(float x) const
+    {
+        return x > alpha ? x : 0.f;
+    }
+
+    inline void setKernelParams(ocl::Kernel& kernel) const
+    {
+        kernel.set(3, alpha);
+    }
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::ThresholdedReluOp>(target, stream, alpha);
+    }
+#endif
+
+    int64 getFLOPSPerElement() const { return 1; }
+};
+
+template<>
+const char* const BaseDefaultFunctor<ThresholdedReluFunctor>::ocl_kernel_name = "ThresholdedReluForward";
+
 struct PowerFunctor : public BaseFunctor
 {
     typedef PowerLayer Layer;
@@ -2074,15 +2169,6 @@ struct ExpFunctor : public BaseDefaultFunctor<ExpFunctor>
     }
 #endif // HAVE_DNN_NGRAPH

-#ifdef HAVE_WEBNN
-    ml::Operand initWebnnAPI(const ml::GraphBuilder& builder, const ml::Operand& input)
-    {
-        CV_Error(Error::StsNotImplemented, "");
-        ml::Operand operand;
-        return operand;
-    }
-#endif
-
     int64 getFLOPSPerElement() const { return 3; }
 };
@@ -2489,6 +2575,50 @@ Ptr<TanLayer> TanLayer::create(const LayerParams& params)
     return l;
 }

+Ptr<CeluLayer> CeluLayer::create(const LayerParams& params)
+{
+    float alpha = params.get<float>("alpha", 1.f);
+    Ptr<CeluLayer> l(new ElementWiseLayer<CeluFunctor>(CeluFunctor(alpha)));
+    l->setParamsFrom(params);
+    l->alpha = alpha;
+
+    return l;
+}
+
+Ptr<HardSigmoidLayer> HardSigmoidLayer::create(const LayerParams& params)
+{
+    float alpha = params.get<float>("alpha", 0.2f);
+    float beta = params.get<float>("beta", 0.5f);
+    Ptr<HardSigmoidLayer> l(new ElementWiseLayer<HardSigmoidFunctor>(HardSigmoidFunctor(alpha, beta)));
+    l->setParamsFrom(params);
+    l->alpha = alpha;
+    l->beta = beta;
+
+    return l;
+}
+
+Ptr<SeluLayer> SeluLayer::create(const LayerParams& params)
+{
+    float alpha = params.get<float>("alpha", 1.67326319217681884765625f);
+    float gamma = params.get<float>("gamma", 1.05070102214813232421875f);
+    Ptr<SeluLayer> l(new ElementWiseLayer<SeluFunctor>(SeluFunctor(alpha, gamma)));
+    l->setParamsFrom(params);
+    l->alpha = alpha;
+    l->gamma = gamma;
+
+    return l;
+}
+
+Ptr<ThresholdedReluLayer> ThresholdedReluLayer::create(const LayerParams& params)
+{
+    float alpha = params.get<float>("alpha", 1.f);
+    Ptr<ThresholdedReluLayer> l(new ElementWiseLayer<ThresholdedReluFunctor>(ThresholdedReluFunctor(alpha)));
+    l->setParamsFrom(params);
+    l->alpha = alpha;
+
+    return l;
+}
+
 Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
 {
     float power = params.get<float>("power", 1.0f);
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index 81a5df1a28..80fe0b173e 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -242,6 +242,70 @@ public:
     }
 };

+class CeluSubgraph : public Subgraph
+{
+public:
+    CeluSubgraph() : alpha(1.f)
+    {
+        int input = addNodeToMatch("");
+        int div = addNodeToMatch("Div", input, addNodeToMatch(""));
+        int elu = addNodeToMatch("Elu", div);
+        addNodeToMatch("Mul", addNodeToMatch(""), elu);
+        setFusedNode("Celu", input);
+    }
+
+    static float extractAlpha(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
+    {
+        const Ptr<ImportNodeWrapper> node = net->getNode(node_id);
+        int const_id = getInputNodeId(net, node, input_id);
+        Ptr<ImportNodeWrapper> alpha_ptr = net->getNode(const_id);
+        opencv_onnx::NodeProto* alpha_node = alpha_ptr.dynamicCast<ONNXNodeWrapper>()->node;
+        opencv_onnx::TensorProto alpha_proto = alpha_node->attribute(0).t();
+        Mat alpha_mat = getMatFromTensor(alpha_proto);
+        return *alpha_mat.ptr<float>();
+    }
+
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds,
+                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+    {
+        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        {
+            float alpha_div = extractAlpha(net, matchedNodesIds[0], 1);
+            float alpha_mul = extractAlpha(net, matchedNodesIds[2], 0);
+            float alpha_elu = 1.f;
+
+            Ptr<ImportNodeWrapper> elu_ptr = net->getNode(matchedNodesIds[1]);
+            opencv_onnx::NodeProto* elu_node = elu_ptr.dynamicCast<ONNXNodeWrapper>()->node;
+
+            for (int i = 0; i < elu_node->attribute_size(); i++)
+            {
+                opencv_onnx::AttributeProto attr = elu_node->attribute(i);
+                if (attr.name() != "alpha")
+                    continue;
+                alpha_elu = attr.f();
+            }
+
+            alpha = alpha_div;
+            return alpha_elu == 1.f && alpha_div == alpha_mul;
+        }
+        return false;
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>&,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE
+    {
+        opencv_onnx::NodeProto* node = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        opencv_onnx::AttributeProto* alpha_attr = node->add_attribute();
+        alpha_attr->set_name("alpha");
+        alpha_attr->set_f(alpha);
+    }
+
+protected:
+    float alpha;
+};
+
 class NormalizeSubgraphBase : public Subgraph
 {
 public:
@@ -662,6 +726,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr());
+    subgraphs.push_back(makePtr<CeluSubgraph>());
     subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr());
     subgraphs.push_back(makePtr());
diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl
index 02ed9345c3..040ee20d8a 100644
--- a/modules/dnn/src/opencl/activations.cl
+++ b/modules/dnn/src/opencl/activations.cl
@@ -272,3 +272,37 @@ __kernel void TanForward(const int n, __global T* in, __global T* out) {
     if(index < n)
         out[index] = tan(in[index]);
 }
+
+__kernel void CeluForward(const int n, __global T* in, __global T* out,
+                          const KERNEL_ARG_DTYPE alpha)
+{
+    int index = get_global_id(0);
+    if(index < n)
+        out[index] = max(0.f, in[index]) + min(0.f, alpha * expm1(in[index] / alpha));
+}
+
+__kernel void HardSigmoidForward(const int n, __global T* in, __global T* out,
+                                 const KERNEL_ARG_DTYPE alpha,
+                                 const KERNEL_ARG_DTYPE beta)
+{
+    int index = get_global_id(0);
+    if(index < n)
+        out[index] = max(0.f, min(1.f, alpha * in[index] + beta));
+}
+
+__kernel void SeluForward(const int n, __global T* in, __global T* out,
+                          const KERNEL_ARG_DTYPE alpha,
+                          const KERNEL_ARG_DTYPE gamma)
+{
+    int index = get_global_id(0);
+    if(index < n)
+        out[index] = gamma * (in[index] > 0.f ? in[index] : alpha * expm1(in[index]));
+}
+
+__kernel void ThresholdedReluForward(const int n, __global T* in, __global T* out,
+                                     const KERNEL_ARG_DTYPE alpha)
+{
+    int index = get_global_id(0);
+    if(index < n)
+        out[index] = (in[index] > alpha ? in[index] : 0.f);
+}
diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
index a69ace0d14..e5d0ead9da 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
@@ -57,7 +57,6 @@
 "test_castlike_FLOAT_to_FLOAT16_expanded",
 "test_castlike_FLOAT_to_STRING",
 "test_castlike_STRING_to_FLOAT",
-"test_celu",
 "test_clip",
 "test_clip_default_inbounds",
 "test_clip_default_int8_inbounds",
@@ -173,9 +172,6 @@
 "test_hardmax_example",
 "test_hardmax_negative_axis",
 "test_hardmax_one_hot",
-"test_hardsigmoid",
-"test_hardsigmoid_default",
-"test_hardsigmoid_example",
 "test_identity_opt",
 "test_identity_sequence",
 "test_if",
@@ -524,9 +520,6 @@
 "test_sce_sum_expanded",
 "test_sce_sum_log_prob",
 "test_sce_sum_log_prob_expanded",
-"test_selu",
-"test_selu_default",
-"test_selu_example",
 "test_sequence_insert_at_back",
 "test_sequence_insert_at_front",
 "test_shape",
@@ -579,9 +572,6 @@
 "test_tfidfvectorizer_tf_onlybigrams_levelempty",
 "test_tfidfvectorizer_tf_onlybigrams_skip5",
 "test_tfidfvectorizer_tf_uniandbigrams_skip5",
-"test_thresholdedrelu",
-"test_thresholdedrelu_default",
-"test_thresholdedrelu_example",
 "test_tile",
 "test_tile_precomputed",
 "test_top_k",