Merge pull request #24271 from Kumataro:fix24163

Fix conversion of float32 to int32/uint32 to use rounding to nearest (ties to even). #24271

Fix https://github.com/opencv/opencv/issues/24163
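
Background: the old carotene/NEON paths rounded by adding 0.5 and truncating, which resolves halfway cases away from zero (for positive inputs); this patch switches to round-to-nearest with ties to even, matching the default IEEE 754 `FE_TONEAREST` behaviour. A minimal standalone C++ sketch of the difference (illustration only, not part of the patch):

```cpp
#include <cmath>   // std::lrintf follows the current rounding mode (FE_TONEAREST by default)
#include <cstdio>

int main()
{
    for (float v : {0.5f, 1.5f, 2.5f, 3.5f})
        std::printf("%.1f -> add-0.5: %d, ties-to-even: %ld\n",
                    v, (int)(v + 0.5f), std::lrintf(v));
    // 0.5 -> add-0.5: 1, ties-to-even: 0
    // 1.5 -> add-0.5: 2, ties-to-even: 2
    // 2.5 -> add-0.5: 3, ties-to-even: 2
    // 3.5 -> add-0.5: 4, ties-to-even: 4
    return 0;
}
```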

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

(carotene is BSD)
Kumataro 11 months ago committed by GitHub
parent d9d402916a
commit dba7186378
  1. 3rdparty/carotene/CMakeLists.txt (8 changed lines)
  2. 3rdparty/carotene/src/add_weighted.cpp (13 changed lines)
  3. 3rdparty/carotene/src/blur.cpp (15 changed lines)
  4. 3rdparty/carotene/src/colorconvert.cpp (16 changed lines)
  5. 3rdparty/carotene/src/common.hpp (11 changed lines)
  6. 3rdparty/carotene/src/convert_scale.cpp (215 changed lines)
  7. 3rdparty/carotene/src/div.cpp (23 changed lines)
  8. 3rdparty/carotene/src/phase.cpp (15 changed lines)
  9. 3rdparty/carotene/src/vround_helper.hpp (102 changed lines)
  10. CMakeLists.txt (10 changed lines)
  11. doc/tutorials/introduction/config_reference/config_reference.markdown (1 changed line)
  12. modules/core/include/opencv2/core/hal/intrin_neon.hpp (8 changed lines)
  13. modules/core/test/test_operations.cpp (50 changed lines)

@@ -42,6 +42,14 @@ endif()
if(WITH_NEON)
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
if(NOT DEFINED CAROTENE_NEON_ARCH )
elseif(CAROTENE_NEON_ARCH EQUAL 8)
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
elseif(CAROTENE_NEON_ARCH EQUAL 7)
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
else()
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
endif()
endif()
# we add dummy file to fix XCode build

@@ -39,6 +39,7 @@
#include "common.hpp"
#include "vtransform.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
@@ -106,7 +107,7 @@ template <> struct wAdd<s32>
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
vgamma = vdupq_n_f32(_gamma);
}
void operator() (const VecTraits<s32>::vec128 & v_src0,
@@ -118,7 +119,7 @@ template <> struct wAdd<s32>
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_s32_f32(vs1);
v_dst = vroundq_s32_f32(vs1);
}
void operator() (const VecTraits<s32>::vec64 & v_src0,
@@ -130,7 +131,7 @@ template <> struct wAdd<s32>
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_s32_f32(vs1);
v_dst = vround_s32_f32(vs1);
}
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
@@ -150,7 +151,7 @@ template <> struct wAdd<u32>
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
vgamma = vdupq_n_f32(_gamma);
}
void operator() (const VecTraits<u32>::vec128 & v_src0,
@@ -162,7 +163,7 @@ template <> struct wAdd<u32>
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_u32_f32(vs1);
v_dst = vroundq_u32_f32(vs1);
}
void operator() (const VecTraits<u32>::vec64 & v_src0,
@@ -174,7 +175,7 @@ template <> struct wAdd<u32>
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_u32_f32(vs1);
v_dst = vround_u32_f32(vs1);
}
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const

@@ -41,6 +41,7 @@
#include "common.hpp"
#include "saturate_cast.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
@@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
//#define FLOAT_VARIANT_1_9
#ifdef FLOAT_VARIANT_1_9
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
float32x4_t v0_5 = vdupq_n_f32 (.5);
#else
const int16x8_t vScale = vmovq_n_s16(3640);
#endif
@@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
@@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
@@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
#define FLOAT_VARIANT_1_25
#ifdef FLOAT_VARIANT_1_25
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
float32x4_t v0_5 = vdupq_n_f32 (.5f);
#else
const int16x8_t vScale = vmovq_n_s16(1310);
#endif
@@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else

@@ -40,6 +40,7 @@
#include "common.hpp"
#include "saturate_cast.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
@@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
vSt3 = vmulq_f32(vHF1, vDivTab);
vSt4 = vmulq_f32(vHF2, vDivTab);
float32x4_t bias = vdupq_n_f32(0.5f);
vSt1 = vaddq_f32(vSt1, bias);
vSt2 = vaddq_f32(vSt2, bias);
vSt3 = vaddq_f32(vSt3, bias);
vSt4 = vaddq_f32(vSt4, bias);
uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

@@ -58,6 +58,17 @@
namespace CAROTENE_NS { namespace internal {
#ifndef CAROTENE_NEON_ARCH
# if defined(__aarch64__) || defined(__aarch32__)
# define CAROTENE_NEON_ARCH 8
# else
# define CAROTENE_NEON_ARCH 7
# endif
#endif
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
# error("ARMv7 doen't support A32/A64 Neon instructions")
#endif
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined __GNUC__

@@ -38,6 +38,7 @@
*/
#include "common.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
@@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16,
#else
CVTS_FUNC1(u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16,
#else
CVTS_FUNC(u8, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
@@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16,
#else
CVTS_FUNC(u8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
}
@@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16,
#else
CVTS_FUNC(u8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -463,10 +464,10 @@ CVTS_FUNC(u8, s16, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
}
@@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16,
#else
CVTS_FUNC(u8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
@@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16,
#else
CVTS_FUNC(s8, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16,
#else
CVTS_FUNC1(s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
@@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16,
#else
CVTS_FUNC(s8, u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
vst1q_u16(_dst + i + 0, vRes1_u16);
@@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16,
#else
CVTS_FUNC(s8, s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
vst1q_s16(_dst + i + 0, vRes1_s16);
@@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16,
#else
CVTS_FUNC(s8, s32, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 16)
{
@@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16,
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline3_f32 = vaddq_f32(vline3_f32, vshift);
vline4_f32 = vaddq_f32(vline4_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline3_s32 = vcvtq_s32_f32(vline3_f32);
vline4_s32 = vcvtq_s32_f32(vline4_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
vst1q_s32(_dst + i + 8, vline3_s32);
@@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16,
#else
CVTS_FUNC(u16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16,
#else
CVTS_FUNC(u16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16,
#else
CVTS_FUNC1(u16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8,
#else
CVTS_FUNC(u16, s16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8,
#else
CVTS_FUNC(u16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
@@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16,
#else
CVTS_FUNC(s16, u8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16,
#else
CVTS_FUNC(s16, s8, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8,
#else
CVTS_FUNC(s16, u16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16,
#else
CVTS_FUNC1(s16, 16,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8,
#else
CVTS_FUNC(s16, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
@@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8,
#else
CVTS_FUNC(s32, u8, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
@@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8,
#else
CVTS_FUNC(s32, s8, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8,
#else
CVTS_FUNC(s32, u16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8,
#else
CVTS_FUNC(s32, s16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2097,7 +2098,7 @@ CVTS_FUNC1(s32, 8,
#else
CVTS_FUNC1(s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
vline1_s32 = vcvtq_s32_f32(vline1_f32);
vline2_s32 = vcvtq_s32_f32(vline2_f32);
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}
@@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8,
#else
CVTS_FUNC(f32, s8, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8,
#else
CVTS_FUNC(f32, u16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32);
uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32);
uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8,
#else
CVTS_FUNC(f32, s16, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8,
#else
CVTS_FUNC(f32, s32, 8,
float32x4_t vscale = vdupq_n_f32((f32)alpha);
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
for (size_t i = 0; i < w; i += 8)
{
@@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8,
vline2_f32 = vmulq_f32(vline2_f32, vscale);
vline1_f32 = vaddq_f32(vline1_f32, vshift);
vline2_f32 = vaddq_f32(vline2_f32, vshift);
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
vst1q_s32(_dst + i + 0, vline1_s32);
vst1q_s32(_dst + i + 4, vline2_s32);
}

@@ -39,6 +39,7 @@
#include "common.hpp"
#include "vtransform.hpp"
#include "vround_helper.hpp"
#include <cstring>
#include <cfloat>
@@ -51,13 +52,6 @@ namespace {
#ifdef CAROTENE_NEON
inline float32x4_t vroundq(const float32x4_t& v)
{
const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
return vaddq_f32(v, v_addition);
}
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
@@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
}
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
inline float32x2_t vround(const float32x2_t& v)
{
const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
return vadd_f32(v, v_addition);
}
{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
@@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale)
}
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
template <typename T>

@@ -41,6 +41,7 @@
#include <cmath>
#include "common.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
@@ -121,8 +122,6 @@ void phase(const Size2D &size,
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
float32x4_t v_05 = vdupq_n_f32(0.5f);
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
@@ -149,8 +148,8 @@
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
// 1
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
@@ -161,8 +160,8 @@
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
vmovn_u16(v_dst16s1)));
@@ -182,8 +181,8 @@
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
vst1_u8(dst + j, vmovn_u16(v_dst));
}

@@ -0,0 +1,102 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
#define CAROTENE_SRC_VROUND_HELPER_HPP
#include "common.hpp"
#include "vtransform.hpp"
#ifdef CAROTENE_NEON
/**
* This helper header is for rounding from float32xN to uint32xN or int32xN to nearest, ties to even.
* See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
*/
// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
#define CAROTENE_ROUND_DELTA (12582912.0f)
namespace CAROTENE_NS { namespace internal {
inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtnq_u32_f32(val);
#else
const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
return vcvtq_u32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}
inline uint32x2_t vround_u32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtn_u32_f32(val);
#else
const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
return vcvt_u32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}
inline int32x4_t vroundq_s32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtnq_s32_f32(val);
#else
const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
return vcvtq_s32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}
inline int32x2_t vround_s32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtn_s32_f32(val);
#else
const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
return vcvt_s32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}
} }
#endif // CAROTENE_NEON
#endif
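For reference, CAROTENE_ROUND_DELTA is 1.5f * 2^23: adding it to a float32 whose magnitude is below roughly 2^22 leaves no representable fraction bits, so the FPU's round-to-nearest-even resolves the fractional part, and subtracting it back yields an exactly integral value. A scalar sketch of the ARMv7 fallback (illustration only; assumes the default FE_TONEAREST rounding mode and no -ffast-math):
#include <cstdint>
static inline std::int32_t round_to_even_s32(float v)
{
    const float delta = 12582912.0f;  // 1.5f * 2^23, same as CAROTENE_ROUND_DELTA
    // v + delta is rounded by the FPU with ties to even; subtracting delta
    // restores the magnitude as an exact integer, so the final truncating
    // cast cannot change the value.
    return (std::int32_t)((v + delta) - delta);
}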

@@ -915,7 +915,15 @@ foreach(hal ${OpenCV_HAL})
if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
add_subdirectory(3rdparty/carotene/hal)
ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
if( NOT DEFINED CAROTENE_NEON_ARCH)
set(CAROTENE_NEON_MSG "Auto detected")
elseif( CAROTENE_NEON_ARCH GREATER 7)
set(CAROTENE_NEON_MSG "Force ARMv8+")
else()
set(CAROTENE_NEON_MSG "Force ARMv7")
endif()
list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})")
else()
message(STATUS "Carotene: NEON is not available, disabling carotene...")
endif()

@@ -585,6 +585,7 @@ Following options can be used to change installation layout for common scenarios
| `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
| `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
| `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
| `CAROTENE_NEON_ARCH` | '(auto)' | Selects the NEON architecture for Carotene. If unset, it is auto-detected. If set to 8, ARMv8 (and later) instructions are used; otherwise ARMv7 is used. |
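For example (invocation shown for illustration; requires a NEON-capable toolchain), the ARMv8 code paths can be forced at configure time with `cmake -DCAROTENE_NEON_ARCH=8 <path-to-opencv>`.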
TODO: need separate tutorials covering bindings builds

@@ -1990,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
#else
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
float32x4_t delta = vdupq_n_f32(12582912.0f);
return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta)));
}
#endif
inline v_int32x4 v_floor(const v_float32x4& a)
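This replaces the sign-mask/0.5 trick in the universal-intrinsics fallback with the same delta trick used in carotene. A usage sketch (illustration only; assumes an OpenCV build with NEON enabled; buf is a hypothetical output buffer):
#include <opencv2/core/hal/intrin.hpp>
int buf[4];
cv::v_float32x4 a = cv::v_setall_f32(2.5f);
cv::v_store(buf, cv::v_round(a));  // every lane becomes 2, not 3 (ties to even)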

@@ -1570,4 +1570,54 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is
EXPECT_EQ(1, c.rows);
}
// https://github.com/opencv/opencv/issues/24163
typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
#if defined __riscv
TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
#else
TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
#endif
{
const int matDepth = get<0>(GetParam());
const int matHeight= get<1>(GetParam());
const int matWidth = 3; // Fixed
const int alpha = get<2>(GetParam());
const int beta = get<3>(GetParam());
// If alpha and/or beta are negative and matDepth is unsigned, the test is skipped.
if( ( (alpha < 0) || (beta < 0) )
&&
( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) )
{
throw SkipTestException( cv::format("Test is skipped (matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) );
}
const int matType = CV_MAKE_TYPE(matDepth, 1);
const Size matSize(matWidth, matHeight);
const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha));
const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta));
const Mat result = ( src1 + src2 ) / 2;
// Expect that the default rounding mode is FE_TONEAREST (ties to even).
const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 );
const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean));
// Compare result and expected.
ASSERT_EQ(expected.size(), result.size());
EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) <<
"result=" << std::endl << result << std::endl <<
"expected=" << std::endl << expected;
}
INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163,
testing::Combine(
testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatDepth
testing::Values( 3, 4, 5, 6), // MatHeight
testing::Values(-2,-1, 0, 1, 2), // src1
testing::Values( -1, 0, 1 ) // src2
)
);
}} // namespace
