diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index acdc778b3c..ea1e70ae30 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -975,4 +975,49 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine( /* withCann= */ false) // only test on CPU )); +using Layer_Elementwise = TestBaseWithParam, std::string, tuple>>; +PERF_TEST_P_(Layer_Elementwise, elementwise) { + std::vector input_shape = get<0>(GetParam()); + std::string op = get<1>(GetParam()); + int backend_id = get<0>(get<2>(GetParam())); + int target_id = get<1>(get<2>(GetParam())); + + Mat input(input_shape, CV_32F); + randn(input, 0.f, 1.f); + + LayerParams lp; + lp.type = op; + lp.name = "TestLayer"; + + Net net; + net.addLayerToPrev(lp.name, lp.type, lp); + + // Warmup + { + net.setInput(input); + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + Mat out = net.forward(); + } + + TEST_CYCLE() { + net.forward(); + } + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine( + testing::Values(std::vector{1, 50, 3072}), + testing::Values(std::string{"Gelu"}), + dnnBackendsAndTargets(/* withInferenceEngine= */ true, + /* withHalide= */ false, + /* withCpuOCV= */ true, + /* withVkCom= */ false, + /* withCUDA= */ true, + /* withNgraph= */ true, + /* withWebnn= */ false, + /* withCann= */ false) // only test on CPU +)); + } // namespace diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 6b7909b1b7..477aad88be 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -813,20 +813,82 @@ private: static const char* const ocl_kernel_name; }; -struct GeluFunctor : public BaseDefaultFunctor -{ - typedef GeluLayer Layer; +namespace { + // Refer to v_erf in modules/core/include/opencv2/core/hal/intrin_math.hpp + constexpr float c_erf_coef0 = 0.3275911f; + constexpr float c_erf_coef1 = 1.061405429f; + constexpr float c_erf_coef2 = -1.453152027f; + constexpr float c_erf_coef3 = 1.421413741f; + constexpr float c_erf_coef4 = -0.284496736f; + constexpr float c_erf_coef5 = 0.254829592f; + + inline float erf_approx(float v) { + float t = 1.f / fmaf(fabsf(v), c_erf_coef0, 1.f); + float r = fmaf(c_erf_coef1, t, c_erf_coef2); + r = fmaf(r, t, c_erf_coef3); + r = fmaf(r, t, c_erf_coef4); + r = fmaf(r, t, c_erf_coef5); + r = 1.f - r * t * expf(-v * v); + return std::copysignf(r, v); + } +} - explicit GeluFunctor() {} +struct GeluFunctor : public BaseFunctor { + using Layer = GeluLayer; + int vlanes; - bool supportBackend(int backendId, int) - { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA; + explicit GeluFunctor() { +#if (CV_SIMD || CV_SIMD_SCALABLE) + vlanes = VTraits::vlanes(); +#else + vlanes = 1; +#endif } - inline float calculate(float x) const - { - return 0.5f * x * (1.0f + erf(x * M_SQRT1_2)); + bool supportBackend(int backendId, int) { + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; + } + + void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const { + CV_UNUSED(stripeStart); + for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) { + int i = 0; +#if (CV_SIMD || CV_SIMD_SCALABLE) + // 0.5f * x * (1.0f + erf(x * M_SQRT1_2)); + v_float32 half = vx_setall_f32(0.5f), + one = vx_setall_f32(1.0f), + reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2); + for (; i <= len - vlanes; i += vlanes) { + if (i + vlanes > len) { + if (i == 0 || i == len) { + break; + } + i = len - vlanes; + } + v_float32 x0 = vx_load(srcptr + i); + + // t = x * M_SQRT1_2 + v_float32 t0 = v_mul(reciprocal_sqrt2, x0); + + // t = 1.0f + t + t0 = v_add(one, v_erf(t0)); + + // x = 0.5 * x + x0 = v_mul(half, x0); + + // x = x * t + x0 = v_mul(x0, t0); + + vx_store(dstptr + i, x0); + } +#endif + // 0.5f * x * (1.0f + erf(x * M_SQRT1_2)); + for( ; i < len; i++ ) + { + float x = srcptr[i]; + dstptr[i] = 0.5f * x * (1.0f + erf_approx(x * M_SQRT1_2)); + } + } } #ifdef HAVE_CUDA @@ -836,12 +898,55 @@ struct GeluFunctor : public BaseDefaultFunctor } #endif +#ifdef HAVE_OPENCL + bool initKernel(ocl::Kernel &ker, const UMat &src) const + { + String buildopt = oclGetTMacro(src); + + if (!ker.create("GeluForward", ocl::dnn::activations_oclsrc, buildopt)) + return false; + + return true; + } + + bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) + { + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat& src = inputs[i]; + UMat& dst = outputs[i]; + CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset); + + ocl::Kernel kernel; + CV_Assert(initKernel(kernel, src)); + kernel.set(0, (int)src.total()); + kernel.set(1, ocl::KernelArg::PtrReadOnly(src)); + kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(kernel.run(1, &gSize, NULL, false)); + } + + return true; + } +#endif + +#ifdef HAVE_DNN_NGRAPH + std::shared_ptr initNgraphAPI(const ngraph::Output& node) + { + return std::make_shared(node); + } +#endif // HAVE_DNN_NGRAPH + int64 getFLOPSPerElement() const { return 100; } }; -template<> -const char* const BaseDefaultFunctor::ocl_kernel_name = "GeluForward"; - namespace GeluApproximationConstants { static constexpr float sqrt_2_pi = 0.7978845834732056f; diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl index 96b56725fb..bbd03b2ea1 100644 --- a/modules/dnn/src/opencl/activations.cl +++ b/modules/dnn/src/opencl/activations.cl @@ -48,6 +48,10 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#if !defined(M_SQRT1_2) +#define M_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ +#endif + __kernel void ReLUForward(const int count, __global const T* in, __global T* out #ifndef RELU_NO_SLOPE , KERNEL_ARG_DTYPE negative_slope diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp index bd892adb2f..8af34695dd 100644 --- a/modules/dnn/test/test_onnx_conformance.cpp +++ b/modules/dnn/test/test_onnx_conformance.cpp @@ -282,6 +282,14 @@ static const TestCase testConformanceConfig[] = { {"test_gathernd_example_float32", 2, 1}, {"test_gathernd_example_int32", 2, 1}, {"test_gathernd_example_int32_batch_dim1", 2, 1}, + {"test_gelu_default_1", 1, 1}, + {"test_gelu_default_1_expanded", 1, 1}, + {"test_gelu_default_2", 1, 1}, + {"test_gelu_default_2_expanded", 1, 1}, + {"test_gelu_tanh_1", 1, 1}, + {"test_gelu_tanh_1_expanded", 1, 1}, + {"test_gelu_tanh_2", 1, 1}, + {"test_gelu_tanh_2_expanded", 1, 1}, {"test_gemm_all_attributes", 3, 1}, {"test_gemm_alpha", 3, 1}, {"test_gemm_beta", 3, 1}, @@ -1123,6 +1131,19 @@ TEST_P(Test_ONNX_conformance, Layer_Test) if (name == "test_pow") { default_lInf = 0.00013; // Expected: (normInf) <= (lInf), actual: 0.00012207 vs 0.0001 } + if (name == "test_gelu_tanh_1") { + default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05 + default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001 + } + if (name == "test_gelu_tanh_2") { + if (target == DNN_TARGET_OPENCL_FP16) { + default_l1 = 0.00016; // Expected: (normL1) <= (l1), actual: 0.000157223 vs 9e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00153041 vs 0.0005 + } else { + default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05 + default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001 + } + } } #ifdef HAVE_HALIDE else if (backend == DNN_BACKEND_HALIDE) @@ -1146,6 +1167,15 @@ TEST_P(Test_ONNX_conformance, Layer_Test) { applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE); } + + if (name == "test_gelu_tanh_1") { + default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05 + default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001 + } + if (name == "test_gelu_tanh_2") { + default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05 + default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001 + } } #endif #ifdef HAVE_CUDA @@ -1159,6 +1189,20 @@ TEST_P(Test_ONNX_conformance, Layer_Test) { applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE); } + + if (name == "test_gelu_tanh_1") { + default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05 + default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001 + } + if (name == "test_gelu_tanh_2") { + if (target == DNN_TARGET_CUDA_FP16) { + default_l1 = 0.00023; // Expected: (normL1) <= (l1), actual: 0.000220591 vs 9e-05 + default_lInf = 0.0023; // Expected: (normInf) <= (lInf), actual: 0.00220466 vs 0.0005 + } else { + default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80127e-05 vs 1e-05 + default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001 + } + } } #endif else diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp index 229bb9ca82..cbbc349bda 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp @@ -688,6 +688,36 @@ CASE(test_gathernd_example_int32) // no filter CASE(test_gathernd_example_int32_batch_dim1) // no filter +CASE(test_gelu_default_1) + // no filter +CASE(test_gelu_default_1_expanded) + // no filter +CASE(test_gelu_default_2) + // no filter +CASE(test_gelu_default_2_expanded) + // no filter +CASE(test_gelu_tanh_1) + if (target == DNN_TARGET_CPU) { + default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05 + default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001 + } + if (target == DNN_TARGET_OPENCL) { + default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05 + default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001 + } +CASE(test_gelu_tanh_1_expanded) + // no filter +CASE(test_gelu_tanh_2) + if (target == DNN_TARGET_CPU) { + default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80057e-05 vs 1e-05 + default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001 + } + if (target == DNN_TARGET_OPENCL) { + default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80144e-05 vs 1e-05 + default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001 + } +CASE(test_gelu_tanh_2_expanded) + // no filter CASE(test_gemm_all_attributes) // no filter CASE(test_gemm_alpha) diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp index 243c7e704d..7253a64cef 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp @@ -117,6 +117,10 @@ "test_gathernd_example_float32", "test_gathernd_example_int32", "test_gathernd_example_int32_batch_dim1", +"test_gelu_default_1_expanded", // parser: no corresponding layer for CastLike +"test_gelu_default_2_expanded", // parser: no corresponding layer for CastLike +"test_gelu_tanh_1_expanded", // parser: no corresponding layer for CastLike +"test_gelu_tanh_2_expanded", // parser: no corresponding layer for CastLike "test_gemm_all_attributes", "test_gemm_alpha", "test_gemm_beta", diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index e560ff2dbe..e58d83cdbd 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -3149,6 +3149,15 @@ TEST_P(Test_ONNX_nets, ViT_B_32) { l1 = 0.008; lInf = 0.04; } + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { + if (target == DNN_TARGET_CPU) { + l1 = 4.4e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05 + lInf = 0.0002; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001 + } else if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) { + l1 = 0.0092; // Expected: (normL1) <= (l1), actual: 0.00918349 vs 4.4e-05 + lInf = 0.056; // Expected: (normInf) <= (lInf), actual: 0.0556431 vs 0.0002 + } + } normAssert(ref, out, "ViTB_32", l1, lInf); }