From 23b244d3a336111c5205c1514b8ef263fac7d808 Mon Sep 17 00:00:00 2001
From: Yuantao Feng
Date: Fri, 19 Jul 2024 21:03:19 +0800
Subject: [PATCH] Merge pull request #25881 from fengyuentau:dnn/cpu/optimize_activations_with_v_exp

dnn: optimize activations with v_exp #25881

Merge with https://github.com/opencv/opencv_extra/pull/1191.

This PR optimizes the following activations:

- [x] Swish
- [x] Mish
- [x] Elu
- [x] Celu
- [x] Selu
- [x] HardSwish

### Performance (Updated on 2024-07-18)

#### AmLogic A311D2 (ARM Cortex-A73 + Cortex-A53)

```
Geometric mean (ms)

Name of Test                               activations    activations.patch    activations.patch vs activations (x-factor)
Celu::Layer_Elementwise::OCV/CPU               115.859               27.930                                           4.15
Elu::Layer_Elementwise::OCV/CPU                 27.846               27.003                                           1.03
Gelu::Layer_Elementwise::OCV/CPU                 0.657                0.602                                           1.09
HardSwish::Layer_Elementwise::OCV/CPU           31.885                6.781                                           4.70
Mish::Layer_Elementwise::OCV/CPU                35.729               32.089                                           1.11
Selu::Layer_Elementwise::OCV/CPU                61.955               27.850                                           2.22
Swish::Layer_Elementwise::OCV/CPU               30.819               26.688                                           1.15
```

#### Apple M1

```
Geometric mean (ms)

Name of Test                                    activations    activations.patch    activations.patch vs activations (x-factor)
Celu::Layer_Elementwise::OCV/CPU                     16.184                2.118                                           7.64
Celu::Layer_Elementwise::OCV/CPU_FP16                16.280                2.123                                           7.67
Elu::Layer_Elementwise::OCV/CPU                       9.123                1.878                                           4.86
Elu::Layer_Elementwise::OCV/CPU_FP16                  9.085                1.897                                           4.79
Gelu::Layer_Elementwise::OCV/CPU                      0.089                0.081                                           1.11
Gelu::Layer_Elementwise::OCV/CPU_FP16                 0.086                0.074                                           1.17
HardSwish::Layer_Elementwise::OCV/CPU                 1.560                1.555                                           1.00
HardSwish::Layer_Elementwise::OCV/CPU_FP16            1.536                1.523                                           1.01
Mish::Layer_Elementwise::OCV/CPU                      6.077                2.476                                           2.45
Mish::Layer_Elementwise::OCV/CPU_FP16                 5.990                2.496                                           2.40
Selu::Layer_Elementwise::OCV/CPU                     11.351                1.976                                           5.74
Selu::Layer_Elementwise::OCV/CPU_FP16                11.533                1.985                                           5.81
Swish::Layer_Elementwise::OCV/CPU                     4.687                1.890                                           2.48
Swish::Layer_Elementwise::OCV/CPU_FP16                4.715                1.873                                           2.52
```

#### Intel i7-12700K

```
Geometric mean (ms)

Name of Test                               activations    activations.patch    activations.patch vs activations (x-factor)
Celu::Layer_Elementwise::OCV/CPU                17.106                3.560                                           4.81
Elu::Layer_Elementwise::OCV/CPU                  5.064                3.478                                           1.46
Gelu::Layer_Elementwise::OCV/CPU                 0.036                0.035                                           1.04
HardSwish::Layer_Elementwise::OCV/CPU            2.914                2.893                                           1.01
Mish::Layer_Elementwise::OCV/CPU                 3.820                3.529                                           1.08
Selu::Layer_Elementwise::OCV/CPU                10.799                3.593                                           3.01
Swish::Layer_Elementwise::OCV/CPU                3.651                3.473                                           1.05
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable;
      the patch to opencv_extra has the same branch name
- [x] The feature is well documented and sample code can be built with the project CMake
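Implementation note: all six activations are vectorized with the same pattern. Each functor's `apply` walks the channel plane in steps of `VTraits<v_float32>::vlanes()`, evaluates the exponential vector-wide with `v_exp`, and finishes the tail (or the whole range when SIMD is unavailable) with the scalar `calculate`. Below is a minimal standalone sketch of that pattern for Swish, `x / (1 + exp(-x))`. It is an illustration written for this description — the helper name `swish_v_exp` is invented here, and it assumes an OpenCV build recent enough to provide `v_exp` in the universal intrinsics — not the exact code from `elementwise_layers.cpp`:

```
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper for illustration only; the real patch implements this
// inside each functor's apply() in modules/dnn/src/layers/elementwise_layers.cpp.
static void swish_v_exp(const float* srcptr, float* dstptr, int len)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_float32>::vlanes();
    const v_float32 one = vx_setall_f32(1.0f);
    for (; i <= len - vlanes; i += vlanes)
    {
        v_float32 x = vx_load(srcptr + i);
        // The key change of this PR: the exponential is computed vector-wide
        // with v_exp instead of calling std::exp once per element.
        v_float32 t = v_exp(v_sub(vx_setzero_f32(), x)); // exp(-x)
        vx_store(dstptr + i, v_div(x, v_add(one, t)));
    }
#endif
    // Scalar tail: the remaining len % vlanes elements, or the whole range
    // when universal intrinsics are not available.
    for (; i < len; i++)
        dstptr[i] = srcptr[i] / (1.0f + std::exp(-srcptr[i]));
}
```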
---
 modules/dnn/perf/perf_layer.cpp               |  93 ++++---
 modules/dnn/src/layers/elementwise_layers.cpp | 243 ++++++++++++++++--
 modules/dnn/test/test_onnx_conformance.cpp    |   8 +
 ...conformance_layer_filter__openvino.inl.hpp |  16 ++
 ..._conformance_layer_parser_denylist.inl.hpp |   6 +
 5 files changed, 306 insertions(+), 60 deletions(-)

diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
index ea1e70ae30..98adc56ffb 100644
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -975,49 +975,72 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
                            /* withCann= */ false) // only test on CPU
 ));
 
-using Layer_Elementwise = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
-PERF_TEST_P_(Layer_Elementwise, elementwise) {
-    std::vector<int> input_shape = get<0>(GetParam());
-    std::string op = get<1>(GetParam());
-    int backend_id = get<0>(get<2>(GetParam()));
-    int target_id = get<1>(get<2>(GetParam()));
+struct Layer_Elementwise : public TestBaseWithParam<tuple<Backend, Target>> {
+    void test_layer(const std::string &op_type, const std::vector<int> &input_shape) {
+        int backend_id = get<0>(GetParam());
+        int target_id = get<1>(GetParam());
 
-    Mat input(input_shape, CV_32F);
-    randn(input, 0.f, 1.f);
+        Mat input(input_shape, CV_32F);
+        randu(input, -10.0f, 10.f);
 
-    LayerParams lp;
-    lp.type = op;
-    lp.name = "TestLayer";
+        LayerParams lp;
+        lp.type = op_type;
+        lp.name = cv::format("PerfLayer/%s", op_type.c_str());
 
-    Net net;
-    net.addLayerToPrev(lp.name, lp.type, lp);
+        Net net;
+        net.addLayerToPrev(lp.name, lp.type, lp);
 
-    // Warmup
-    {
-        net.setInput(input);
-        net.setPreferableBackend(backend_id);
-        net.setPreferableTarget(target_id);
-        Mat out = net.forward();
-    }
+        // Warmup
+        {
+            net.setInput(input);
+            net.setPreferableBackend(backend_id);
+            net.setPreferableTarget(target_id);
+            net.forward();
+        }
 
-    TEST_CYCLE() {
-        net.forward();
+        TEST_CYCLE() {
+            net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
     }
 
-    SANITY_CHECK_NOTHING();
+    int N = 2;
+    int C = 32;
+    int H = 416;
+    int W = 416;
+};
+
+PERF_TEST_P_(Layer_Elementwise, Gelu) {
+    test_layer("Gelu", std::vector<int>{1, 50, 3072});
+}
+PERF_TEST_P_(Layer_Elementwise, Swish) {
+    test_layer("Swish", std::vector<int>{N, C, H, W});
+}
+PERF_TEST_P_(Layer_Elementwise, Mish) {
+    test_layer("Mish", std::vector<int>{N, C, H, W});
+}
+PERF_TEST_P_(Layer_Elementwise, Elu) {
+    test_layer("ELU", std::vector<int>{N, C, H, W});
+}
+PERF_TEST_P_(Layer_Elementwise, Celu) {
+    test_layer("Celu", std::vector<int>{N, C, H, W});
+}
+PERF_TEST_P_(Layer_Elementwise, Selu) {
+    test_layer("Selu", std::vector<int>{N, C, H, W});
+}
+PERF_TEST_P_(Layer_Elementwise, HardSwish) {
+    test_layer("HardSwish", std::vector<int>{N, C, H, W});
 }
 
-INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine(
-    testing::Values(std::vector<int>{1, 50, 3072}),
-    testing::Values(std::string{"Gelu"}),
-    dnnBackendsAndTargets(/* withInferenceEngine= */ true,
-                          /* withHalide= */ false,
-                          /* withCpuOCV= */ true,
-                          /* withVkCom= */ false,
-                          /* withCUDA= */ true,
-                          /* withNgraph= */ true,
-                          /* withWebnn= */ false,
-                          /* withCann= */ false) // only test on CPU
-));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise,
+                        dnnBackendsAndTargets(/* withInferenceEngine= */ true,
+                                              /* withHalide= */ false,
+                                              /* withCpuOCV= */ true,
+                                              /* withVkCom= */ false,
+                                              /* withCUDA= */ true,
+                                              /* withNgraph= */ true,
+                                              /* withWebnn= */ false,
+                                              /* withCann= */ false));
 
 } // namespace
 
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 477aad88be..6c06554d5f 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -859,12 +859,6 @@ struct GeluFunctor : public BaseFunctor {
                       one = vx_setall_f32(1.0f),
                       reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
             for (; i <= len - vlanes; i += vlanes) {
-                if (i + vlanes > len) {
-                    if (i == 0 || i == len) {
-                        break;
-                    }
-                    i = len - vlanes;
-                }
                 v_float32 x0 = vx_load(srcptr + i);
 
                 // t = x * M_SQRT1_2
@@ -1048,7 +1042,17 @@ const char* const TanHFunctor::BaseDefaultFunctor<TanHFunctor>::ocl_kernel_name
 
 struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
 {
-    typedef SwishLayer Layer;
+    using Layer = SwishLayer;
+
+    int vlanes;
+
+    explicit SwishFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -1064,6 +1068,32 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
         return x / (1.f + exp(-x));
     }
 
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            // x / (1.f + exp(-x));
+            v_float32 one = vx_setall_f32(1.0f),
+                      zero = vx_setzero_f32();
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                v_float32 t = v_sub(zero, x);
+                t = v_exp(t);
+                t = v_add(one, t);
+                t = v_div(x, t);
+
+                vx_store(dstptr + i, t);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
+    }
+
 #ifdef HAVE_CUDA
     Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
     {
@@ -1116,9 +1146,27 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
 template<>
 const char* const SwishFunctor::BaseDefaultFunctor<SwishFunctor>::ocl_kernel_name = "SwishForward";
 
+namespace {
+    constexpr float MISH_THRESHOLD = -36.73f;
+}
+
+/*
+    This implementation is derived from
+    https://github.com/vpisarev/ficus/blob/3c9a8b78f49e17489c5e1fd6dd5dd487348c99c2/lib/NN/OpElemwise.fx#L110
+*/
 struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
 {
-    typedef MishLayer Layer;
+    using Layer = MishLayer;
+
+    int vlanes;
+
+    explicit MishFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -1131,15 +1179,34 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
 
     inline float calculate(float x) const
     {
-        // Use fast approximation introduced in https://github.com/opencv/opencv/pull/17200
-        if (x >= 8.f)
-        {
-            return x;
-        }
+        float y = x > MISH_THRESHOLD ? std::exp(-x) : 1.f;
+        x *= x > MISH_THRESHOLD ? 1.f : 0.f;
+        return x * (1 + 2 * y) / (1 + 2 * y + 2 * y * y);
+    }
 
-        float eX = exp(x);
-        float n = (eX + 2.f) * eX;
-        return (x * n) / (n + 2.f);
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            v_float32 v_threshold = vx_setall_f32(MISH_THRESHOLD), one = vx_setall_f32(1.f), z = vx_setzero_f32();
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                x = v_select(v_le(x, v_threshold), z, x);
+                v_float32 y = v_exp(v_sub(z, x));
+                v_float32 _2y = v_add(y, y),
+                          _2ya1 = v_add(_2y, one);
+                x = v_div(v_mul(x, _2ya1), v_add(_2ya1, v_mul(_2y, y)));
+
+                vx_store(dstptr + i, x);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
     }
 
 #ifdef HAVE_CUDA
@@ -1270,10 +1337,18 @@ const char* const SigmoidFunctor::BaseDefaultFunctor<SigmoidFunctor>::ocl_kernel
 
 struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
 {
-    typedef ELULayer Layer;
+    using Layer = ELULayer;
+
     float alpha;
+    int vlanes;
 
-    explicit ELUFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
+    explicit ELUFunctor(float alpha_ = 1.f) : alpha(alpha_) {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -1292,6 +1367,28 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
         return x >= 0.f ? x : alpha * (exp(x) - 1.f);
     }
 
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            v_float32 z = vx_setzero_f32(), v_alpha = vx_setall_f32(alpha), one = vx_setall_f32(1.0f);
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                v_float32 t = v_mul(v_alpha, v_sub(v_exp(x), one));
+                x = v_select(v_ge(x, z), x, t);
+
+                vx_store(dstptr + i, x);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
+    }
+
     inline void setKernelParams(ocl::Kernel& kernel) const
     {
         kernel.set(3, alpha);
@@ -1991,7 +2088,16 @@ const char* const BaseDefaultFunctor<ErfFunctor>::ocl_kernel_name = "ErfForward"
 
 struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
 {
-    typedef HardSwishLayer Layer;
+    using Layer = HardSwishLayer;
+    int vlanes;
+
+    explicit HardSwishFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -2002,7 +2108,32 @@ struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
 
     inline float calculate(float x) const
     {
-        return x * max(0.f, min(1.f, x / 6.f + 0.5f));
+        return x * std::max(0.f, std::min(1.f, x / 6.f + 0.5f));
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            v_float32 zero = vx_setzero_f32(), one = vx_setall_f32(1.0f),
+                      half = vx_setall_f32(0.5f), sixth = vx_setall_f32(1 / 6.0f);
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                v_float32 t = v_add(v_mul(x, sixth), half);
+                t = v_min(one, t);
+                t = v_max(zero, t);
+                t = v_mul(x, t);
+
+                vx_store(dstptr + i, t);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
     }
 
 #ifdef HAVE_CUDA
@@ -2176,11 +2307,18 @@ const char* const BaseDefaultFunctor<TanFunctor>::ocl_kernel_name = "TanForward"
 
 struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
 {
-    typedef CeluLayer Layer;
+    using Layer = CeluLayer;
 
     float alpha;
+    int vlanes;
 
-    explicit CeluFunctor(float alpha_ = 1.f) : alpha(alpha_) {}
+    explicit CeluFunctor(float alpha_ = 1.f) : alpha(alpha_) {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -2189,7 +2327,30 @@ struct CeluFunctor : public BaseDefaultFunctor<CeluFunctor>
 
     inline float calculate(float x) const
     {
-        return max(0.f, x) + min(0.f, alpha * expm1(x / alpha));
+        return std::max(0.f, x) + std::min(0.f, alpha * expm1(x / alpha));
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            v_float32 zero = vx_setzero_f32(), v_alpha = vx_setall_f32(alpha),
+                      one = vx_setall_f32(1.0f), v_ralpha = vx_setall_f32(1.0f / alpha);
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                v_float32 t = v_min(zero, v_mul(v_alpha, v_sub(v_exp(v_mul(x, v_ralpha)), one)));
+                t = v_add(v_max(zero, x), t);
+
+                vx_store(dstptr + i, t);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
     }
 
     inline void setKernelParams(ocl::Kernel& kernel) const
@@ -2250,13 +2411,18 @@ const char* const BaseDefaultFunctor<HardSwishFunctor>::ocl_kernel_name = "Har
 
 struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
 {
-    typedef SeluLayer Layer;
+    using Layer = SeluLayer;
 
     float alpha;
     float gamma;
+    int vlanes;
 
     explicit SeluFunctor(float alpha_ = 1.67326319217681884765625f,
-                         float gamma_ = 1.05070102214813232421875f) : alpha(alpha_), gamma(gamma_) {}
+                         float gamma_ = 1.05070102214813232421875f)
+        : alpha(alpha_), gamma(gamma_) {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
 
     bool supportBackend(int backendId, int)
     {
@@ -2268,6 +2437,30 @@ struct SeluFunctor : public BaseDefaultFunctor<SeluFunctor>
         return gamma * (x > 0.f ? x : alpha * expm1(x));
     }
 
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            v_float32 z = vx_setzero_f32(), one = vx_setall_f32(1.0f),
+                      v_alpha = vx_setall_f32(alpha), v_gamma = vx_setall_f32(gamma);
+            for (; i <= len - vlanes; i += vlanes) {
+                v_float32 x = vx_load(srcptr + i);
+
+                v_float32 t = v_mul(v_alpha, v_sub(v_exp(x), one));
+                x = v_select(v_le(x, z), t, x);
+                x = v_mul(v_gamma, x);
+
+                vx_store(dstptr + i, x);
+            }
+#endif
+            // In case SIMD is not available or len < vlanes
+            for (; i < len; i++) {
+                dstptr[i] = calculate(srcptr[i]);
+            }
+        }
+    }
+
     inline void setKernelParams(ocl::Kernel& kernel) const
     {
         kernel.set(3, alpha);
diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp
index 47e6f22fce..57969ced87 100644
--- a/modules/dnn/test/test_onnx_conformance.cpp
+++ b/modules/dnn/test/test_onnx_conformance.cpp
@@ -250,7 +250,10 @@ static const TestCase testConformanceConfig[] = {
     {"test_einsum_transpose", 1, 1},
     {"test_elu", 1, 1},
     {"test_elu_default", 1, 1},
+    {"test_elu_default_expanded_ver18", 1, 1},
     {"test_elu_example", 1, 1},
+    {"test_elu_example_expanded_ver18", 1, 1},
+    {"test_elu_expanded_ver18", 1, 1},
     {"test_equal", 2, 1},
     {"test_equal_bcast", 2, 1},
     {"test_erf", 1, 1},
@@ -454,6 +457,8 @@ static const TestCase testConformanceConfig[] = {
     {"test_min_uint32", 2, 1},
     {"test_min_uint64", 2, 1},
     {"test_min_uint8", 2, 1},
+    {"test_mish", 1, 1},
+    {"test_mish_expanded", 1, 1},
     {"test_mod_broadcast", 2, 1},
     {"test_mod_int64_fmod", 2, 1},
     {"test_mod_mixed_sign_float16", 2, 1},
@@ -775,7 +780,10 @@ static const TestCase testConformanceConfig[] = {
     {"test_sce_sum_log_prob_expanded", 2, 2},
     {"test_selu", 1, 1},
     {"test_selu_default", 1, 1},
+    {"test_selu_default_expanded_ver18", 1, 1},
     {"test_selu_example", 1, 1},
+    {"test_selu_example_expanded_ver18", 1, 1},
+    {"test_selu_expanded_ver18", 1, 1},
     {"test_sequence_insert_at_back", 2, 1},
     {"test_sequence_insert_at_front", 3, 1},
     {"test_shape", 1, 1},
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
index cbbc349bda..9b2a2f4f2d 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
@@ -624,8 +624,14 @@ CASE(test_elu)
     // no filter
 CASE(test_elu_default)
     // no filter
+CASE(test_elu_default_expanded_ver18)
+    // no filter
 CASE(test_elu_example)
     // no filter
+CASE(test_elu_example_expanded_ver18)
+    // no filter
+CASE(test_elu_expanded_ver18)
+    // no filter
 CASE(test_equal)
     // no filter
 CASE(test_equal_bcast)
@@ -1098,6 +1104,10 @@ CASE(test_min_uint64)
     // no filter
 CASE(test_min_uint8)
     // no filter
+CASE(test_mish)
+    // no filter
+CASE(test_mish_expanded)
+    // no filter
 CASE(test_mod_broadcast)
     // no filter
 CASE(test_mod_int64_fmod)
@@ -1851,8 +1861,14 @@ CASE(test_selu)
     // no filter
 CASE(test_selu_default)
     // no filter
+CASE(test_selu_default_expanded_ver18)
+    // no filter
 CASE(test_selu_example)
     // no filter
+CASE(test_selu_example_expanded_ver18)
+    // no filter
+CASE(test_selu_expanded_ver18)
+    // no filter
 CASE(test_sequence_insert_at_back)
     // no filter
 CASE(test_sequence_insert_at_front)
diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
index 7253a64cef..78c26eeea2 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
@@ -103,6 +103,9 @@
 "test_dynamicquantizelinear_min_adjusted_expanded",
 "test_edge_pad",
 "test_einsum_inner_prod",
+"test_elu_default_expanded_ver18",
+"test_elu_example_expanded_ver18",
+"test_elu_expanded_ver18",
 "test_equal",
 "test_equal_bcast",
 "test_expand_dim_changed",
@@ -412,6 +415,9 @@
 "test_sce_sum_expanded",
 "test_sce_sum_log_prob",
 "test_sce_sum_log_prob_expanded",
+"test_selu_default_expanded_ver18",
+"test_selu_example_expanded_ver18",
+"test_selu_expanded_ver18",
 "test_sequence_insert_at_back",
 "test_sequence_insert_at_front",
 "test_shape",
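A note on the Mish rewrite above: with `y = exp(-x)`, the reference definition `mish(x) = x * tanh(ln(1 + exp(x)))` simplifies algebraically to `x * (1 + 2y) / (1 + 2y + 2y*y)`, since `tanh(ln(u)) = (u*u - 1) / (u*u + 1)` with `u = 1 + exp(x)`. This form needs a single exponential and stays finite for large positive `x`, where `exp(x)` in the naive form overflows. `MISH_THRESHOLD = -36.73f` marks the point below which `mish(x)` is vanishingly small in float32 (on the order of 1e-15), so the implementation outputs exact zero there instead of evaluating `exp(-x)`, which grows toward float overflow for very negative inputs. The snippet below is a quick standalone self-check of the identity against the naive formula; it is illustrative test code written for this description, not part of the patch:

```
#include <algorithm>
#include <cmath>
#include <cstdio>

// Naive reference: x * tanh(softplus(x)). Fine for moderate x; exp(x)
// overflows float32 for x > ~88, which is one reason for the rewrite.
static float mish_naive(float x)
{
    return x * std::tanh(std::log1p(std::exp(x)));
}

// The rewritten form used in the patch (scalar version of calculate()).
static float mish_fast(float x)
{
    const float MISH_THRESHOLD = -36.73f;
    float y = x > MISH_THRESHOLD ? std::exp(-x) : 1.f;
    x *= x > MISH_THRESHOLD ? 1.f : 0.f; // below the threshold, mish(x) rounds to 0
    return x * (1 + 2 * y) / (1 + 2 * y + 2 * y * y);
}

int main()
{
    for (float x = -20.f; x <= 20.f; x += 0.25f)
    {
        float ref = mish_naive(x), fast = mish_fast(x);
        if (std::fabs(ref - fast) > 1e-5f * std::max(1.f, std::fabs(ref)))
            std::printf("mismatch at x = %g: %g vs %g\n", x, ref, fast);
    }
    std::printf("check done\n");
    return 0;
}
```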