Merge pull request #25147 from fengyuentau:dnn/elementwise_layers/speedup

* added v_erf and implemented gelu acceleration via vectorization

* remove anonymous v_erf and use v_erf from intrin_math

* enable perf tests for the OpenVINO and CUDA backends
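
For reference, the op being accelerated is the exact, erf-based GELU; the new SIMD path vectorizes the erf term with v_erf:

\[ \operatorname{GELU}(x) = 0.5\,x\,\bigl(1 + \operatorname{erf}(x/\sqrt{2})\bigr) \]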
Yuantao Feng committed via GitHub (commit e3858cc5a3, parent 31b308f882)
Files changed:
  1. modules/dnn/perf/perf_layer.cpp (45 lines changed)
  2. modules/dnn/src/layers/elementwise_layers.cpp (129 lines changed)
  3. modules/dnn/src/opencl/activations.cl (4 lines changed)
  4. modules/dnn/test/test_onnx_conformance.cpp (44 lines changed)
  5. modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp (30 lines changed)
  6. modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp (4 lines changed)
  7. modules/dnn/test/test_onnx_importer.cpp (9 lines changed)

@@ -975,4 +975,49 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
                        /* withCann= */ false) // only test on CPU
 ));
+
+using Layer_Elementwise = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_Elementwise, elementwise) {
+    std::vector<int> input_shape = get<0>(GetParam());
+    std::string op = get<1>(GetParam());
+    int backend_id = get<0>(get<2>(GetParam()));
+    int target_id = get<1>(get<2>(GetParam()));
+
+    Mat input(input_shape, CV_32F);
+    randn(input, 0.f, 1.f);
+
+    LayerParams lp;
+    lp.type = op;
+    lp.name = "TestLayer";
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    // Warmup
+    {
+        net.setInput(input);
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE() {
+        net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Elementwise, testing::Combine(
+    testing::Values(std::vector<int>{1, 50, 3072}),
+    testing::Values(std::string{"Gelu"}),
+    dnnBackendsAndTargets(/* withInferenceEngine= */ true,
+                          /* withHalide= */ false,
+                          /* withCpuOCV= */ true,
+                          /* withVkCom= */ false,
+                          /* withCUDA= */ true,
+                          /* withNgraph= */ true,
+                          /* withWebnn= */ false,
+                          /* withCann= */ false) // only test on CPU
+));
 
 } // namespace
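
If you want to reproduce the measurement outside the perf framework, a minimal standalone harness along the following lines works; the iteration count and the backend/target choice here are illustrative assumptions, not part of the PR:

// Standalone sketch: times the same 1x50x3072 Gelu layer with cv::TickMeter.
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>

int main()
{
    std::vector<int> shape = {1, 50, 3072};
    cv::Mat input(shape, CV_32F);
    cv::randn(input, 0.f, 1.f);

    cv::dnn::LayerParams lp;
    lp.type = "Gelu";
    lp.name = "TestLayer";

    cv::dnn::Net net;
    net.addLayerToPrev(lp.name, lp.type, lp);
    net.setInput(input);
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    net.forward(); // warmup, forces backend initialization

    const int iters = 100; // arbitrary choice
    cv::TickMeter tm;
    tm.start();
    for (int i = 0; i < iters; i++)
        net.forward();
    tm.stop();
    std::cout << "mean forward time: " << tm.getTimeMilli() / iters << " ms\n";
    return 0;
}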

@@ -813,20 +813,82 @@ private:
     static const char* const ocl_kernel_name;
 };
 
-struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
-{
-    typedef GeluLayer Layer;
-
-    explicit GeluFunctor() {}
-
-    bool supportBackend(int backendId, int)
-    {
-        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
-    }
-
-    inline float calculate(float x) const
-    {
-        return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
-    }
+namespace {
+// Refer to v_erf in modules/core/include/opencv2/core/hal/intrin_math.hpp
+constexpr float c_erf_coef0 = 0.3275911f;
+constexpr float c_erf_coef1 = 1.061405429f;
+constexpr float c_erf_coef2 = -1.453152027f;
+constexpr float c_erf_coef3 = 1.421413741f;
+constexpr float c_erf_coef4 = -0.284496736f;
+constexpr float c_erf_coef5 = 0.254829592f;
+
+inline float erf_approx(float v) {
+    float t = 1.f / fmaf(fabsf(v), c_erf_coef0, 1.f);
+    float r = fmaf(c_erf_coef1, t, c_erf_coef2);
+    r = fmaf(r, t, c_erf_coef3);
+    r = fmaf(r, t, c_erf_coef4);
+    r = fmaf(r, t, c_erf_coef5);
+    r = 1.f - r * t * expf(-v * v);
+    return std::copysignf(r, v);
+}
+} // namespace
+
+struct GeluFunctor : public BaseFunctor {
+    using Layer = GeluLayer;
+    int vlanes;
+
+    explicit GeluFunctor() {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        vlanes = VTraits<v_float32>::vlanes();
+#else
+        vlanes = 1;
+#endif
+    }
+
+    bool supportBackend(int backendId, int) {
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const {
+        CV_UNUSED(stripeStart);
+        for (int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize) {
+            int i = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            // 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
+            v_float32 half = vx_setall_f32(0.5f),
+                      one = vx_setall_f32(1.0f),
+                      reciprocal_sqrt2 = vx_setall_f32(M_SQRT1_2);
+            for (; i <= len - vlanes; i += vlanes) {
+                if (i + vlanes > len) {
+                    if (i == 0 || i == len) {
+                        break;
+                    }
+                    i = len - vlanes;
+                }
+
+                v_float32 x0 = vx_load(srcptr + i);
+
+                // t = x * M_SQRT1_2
+                v_float32 t0 = v_mul(reciprocal_sqrt2, x0);
+                // t = 1.0f + erf(t)
+                t0 = v_add(one, v_erf(t0));
+                // x = 0.5f * x
+                x0 = v_mul(half, x0);
+                // x = x * t
+                x0 = v_mul(x0, t0);
+
+                vx_store(dstptr + i, x0);
+            }
+#endif
+            // scalar tail: 0.5f * x * (1.0f + erf_approx(x * M_SQRT1_2));
+            for (; i < len; i++) {
+                float x = srcptr[i];
+                dstptr[i] = 0.5f * x * (1.0f + erf_approx(x * M_SQRT1_2));
+            }
+        }
+    }
 
 #ifdef HAVE_CUDA
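
The scalar tail relies on erf_approx above, whose constants are the classic Abramowitz–Stegun rational approximation 7.1.26 (the same formula v_erf vectorizes), with p = c_erf_coef0 and a1..a5 = c_erf_coef5..c_erf_coef1; its maximum absolute error is about 1.5e-7, and negative inputs are recovered through oddness via std::copysignf:

\[ \operatorname{erf}(x) \approx 1 - \bigl(a_1 t + a_2 t^2 + a_3 t^3 + a_4 t^4 + a_5 t^5\bigr)\, e^{-x^2}, \qquad t = \frac{1}{1 + p\,x}, \quad x \ge 0 \]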
@@ -836,12 +898,55 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
     }
 #endif
 
+#ifdef HAVE_OPENCL
+    bool initKernel(ocl::Kernel &ker, const UMat &src) const
+    {
+        String buildopt = oclGetTMacro(src);
+        if (!ker.create("GeluForward", ocl::dnn::activations_oclsrc, buildopt))
+            return false;
+        return true;
+    }
+
+    bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+            CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);
+
+            ocl::Kernel kernel;
+            CV_Assert(initKernel(kernel, src));
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+        return true;
+    }
+#endif
+
+#ifdef HAVE_DNN_NGRAPH
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
+    {
+        return std::make_shared<ov::op::v0::Gelu>(node);
+    }
+#endif // HAVE_DNN_NGRAPH
+
     int64 getFLOPSPerElement() const { return 100; }
 };
 
-template<>
-const char* const BaseDefaultFunctor<GeluFunctor>::ocl_kernel_name = "GeluForward";
-
 namespace GeluApproximationConstants
 {
     static constexpr float sqrt_2_pi = 0.7978845834732056f;
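
The trailing context lines come from the neighboring GeluApproximation functor, which the test_gelu_tanh conformance cases below exercise; its sqrt_2_pi constant is \(\sqrt{2/\pi}\) from the standard tanh-based approximation (0.044715 being the usual cubic coefficient):

\[ \operatorname{GELU}(x) \approx 0.5\,x\,\Bigl(1 + \tanh\bigl(\sqrt{2/\pi}\,(x + 0.044715\,x^3)\bigr)\Bigr) \]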

@@ -48,6 +48,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
+#if !defined(M_SQRT1_2)
+#define M_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */
+#endif
+
 __kernel void ReLUForward(const int count, __global const T* in, __global T* out
 #ifndef RELU_NO_SLOPE
                           , KERNEL_ARG_DTYPE negative_slope
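
The guard is needed because the unsuffixed OpenCL C math constants such as M_SQRT1_2 are, strictly, only guaranteed alongside double-precision support. The GeluForward kernel that initKernel() loads already lives in this file; a plausible sketch of such an erf-based kernel, assuming the usual T type macro supplied via build options (a reconstruction, not the verbatim source):

__kernel void GeluForward(const int count, __global const T* in, __global T* out)
{
    int index = get_global_id(0);
    if (index < count)
    {
        T x = in[index];
        // 0.5 * x * (1 + erf(x / sqrt(2))); erf is an OpenCL built-in
        out[index] = (T)0.5f * x * ((T)1.f + erf(x * (T)M_SQRT1_2));
    }
}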

@@ -282,6 +282,14 @@ static const TestCase testConformanceConfig[] = {
     {"test_gathernd_example_float32", 2, 1},
     {"test_gathernd_example_int32", 2, 1},
     {"test_gathernd_example_int32_batch_dim1", 2, 1},
+    {"test_gelu_default_1", 1, 1},
+    {"test_gelu_default_1_expanded", 1, 1},
+    {"test_gelu_default_2", 1, 1},
+    {"test_gelu_default_2_expanded", 1, 1},
+    {"test_gelu_tanh_1", 1, 1},
+    {"test_gelu_tanh_1_expanded", 1, 1},
+    {"test_gelu_tanh_2", 1, 1},
+    {"test_gelu_tanh_2_expanded", 1, 1},
     {"test_gemm_all_attributes", 3, 1},
     {"test_gemm_alpha", 3, 1},
     {"test_gemm_beta", 3, 1},
@@ -1123,6 +1131,19 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         if (name == "test_pow") {
             default_lInf = 0.00013; // Expected: (normInf) <= (lInf), actual: 0.00012207 vs 0.0001
         }
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_OPENCL_FP16) {
+                default_l1 = 0.00016; // Expected: (normL1) <= (l1), actual: 0.000157223 vs 9e-05
+                default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00153041 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+            }
+        }
     }
 #ifdef HAVE_HALIDE
     else if (backend == DNN_BACKEND_HALIDE)
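
For context on the thresholds tuned in these hunks: the conformance tests compare OpenCV's output against the ONNX reference with a mean-absolute (normL1) and a max-absolute (normInf) criterion, roughly as sketched below. This is a simplification of OpenCV's actual normAssert test helper, whose usual defaults (l1 = 1e-5, lInf = 1e-4) are the "vs" values quoted in the comments:

// Simplified sketch of the normAssert check used by these tests.
#include <opencv2/ts.hpp>

static void normAssertSketch(const cv::Mat& ref, const cv::Mat& out,
                             double l1 = 1e-5, double lInf = 1e-4)
{
    // mean absolute difference per element
    double normL1 = cvtest::norm(ref, out, cv::NORM_L1) / ref.total();
    // maximum absolute difference
    double normInf = cvtest::norm(ref, out, cv::NORM_INF);
    EXPECT_LE(normL1, l1);
    EXPECT_LE(normInf, lInf);
}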
@@ -1146,6 +1167,15 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
         }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80073e-05 vs 1e-05
+            default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+        }
     }
 #endif
 #ifdef HAVE_CUDA
@@ -1159,6 +1189,20 @@ TEST_P(Test_ONNX_conformance, Layer_Test)
         {
             applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16, CV_TEST_TAG_DNN_SKIP_ONNX_CONFORMANCE);
        }
+
+        if (name == "test_gelu_tanh_1") {
+            default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+            default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+        }
+        if (name == "test_gelu_tanh_2") {
+            if (target == DNN_TARGET_CUDA_FP16) {
+                default_l1 = 0.00023; // Expected: (normL1) <= (l1), actual: 0.000220591 vs 9e-05
+                default_lInf = 0.0023; // Expected: (normInf) <= (lInf), actual: 0.00220466 vs 0.0005
+            } else {
+                default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80127e-05 vs 1e-05
+                default_lInf = 0.0005; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+            }
+        }
     }
 #endif
     else

@@ -688,6 +688,36 @@ CASE(test_gathernd_example_int32)
     // no filter
 CASE(test_gathernd_example_int32_batch_dim1)
     // no filter
+CASE(test_gelu_default_1)
+    // no filter
+CASE(test_gelu_default_1_expanded)
+    // no filter
+CASE(test_gelu_default_2)
+    // no filter
+CASE(test_gelu_default_2_expanded)
+    // no filter
+CASE(test_gelu_tanh_1)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101805 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152707 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 0.00011; // Expected: (normL1) <= (l1), actual: 0.000101815 vs 1e-05
+        default_lInf = 0.00016; // Expected: (normInf) <= (lInf), actual: 0.000152737 vs 0.0001
+    }
+CASE(test_gelu_tanh_1_expanded)
+    // no filter
+CASE(test_gelu_tanh_2)
+    if (target == DNN_TARGET_CPU) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80057e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455521 vs 0.0001
+    }
+    if (target == DNN_TARGET_OPENCL) {
+        default_l1 = 9e-5; // Expected: (normL1) <= (l1), actual: 8.80144e-05 vs 1e-05
+        default_lInf = 0.00046; // Expected: (normInf) <= (lInf), actual: 0.000455445 vs 0.0001
+    }
+CASE(test_gelu_tanh_2_expanded)
+    // no filter
 CASE(test_gemm_all_attributes)
     // no filter
 CASE(test_gemm_alpha)

@@ -117,6 +117,10 @@
 "test_gathernd_example_float32",
 "test_gathernd_example_int32",
 "test_gathernd_example_int32_batch_dim1",
+"test_gelu_default_1_expanded", // parser: no corresponding layer for CastLike
+"test_gelu_default_2_expanded", // parser: no corresponding layer for CastLike
+"test_gelu_tanh_1_expanded", // parser: no corresponding layer for CastLike
+"test_gelu_tanh_2_expanded", // parser: no corresponding layer for CastLike
 "test_gemm_all_attributes",
 "test_gemm_alpha",
 "test_gemm_beta",

@@ -3149,6 +3149,15 @@ TEST_P(Test_ONNX_nets, ViT_B_32) {
         l1 = 0.008;
         lInf = 0.04;
     }
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        if (target == DNN_TARGET_CPU) {
+            l1 = 4.4e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05
+            lInf = 0.0002; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001
+        } else if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) {
+            l1 = 0.0092; // Expected: (normL1) <= (l1), actual: 0.00918349 vs 4.4e-05
+            lInf = 0.056; // Expected: (normInf) <= (lInf), actual: 0.0556431 vs 0.0002
+        }
+    }
 
     normAssert(ref, out, "ViTB_32", l1, lInf);
 }
