Fix MSA reduce-sum overflow by widening the result types

3 years ago · 8b44ee2ce1
parent 9b2b2c88df
commit 8b44ee2ce1
2 changed files with 9 additions and 9 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
@@ -1037,12 +1037,12 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \
    return (scalartype)msa_sum_##suffix(a.val); \
 }

-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8)
-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8)
-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16)
-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)

 inline uint64 v_reduce_sum(const v_uint64x2& a)
--- a/modules/core/include/opencv2/core/hal/msa_macros.h
+++ b/modules/core/include/opencv2/core/hal/msa_macros.h
@@ -719,7 +719,7 @@ typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
  v2i64 _c;                                     \
  _b = __builtin_msa_hadd_s_w(__a, __a);        \
  _c = __builtin_msa_hadd_s_d(_b, _b);          \
-  (int16_t)(_c[0] + _c[1]);                     \
+  (int32_t)(_c[0] + _c[1]);                     \
 })


@@ -736,7 +736,7 @@ typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
 ({                                             \
  v2i64 _b;                                    \
  _b = __builtin_msa_hadd_s_d(__a, __a);       \
-  (int32_t)(_b[0] + _b[1]);                    \
+  (int64_t)(_b[0] + _b[1]);                    \
 })

 /* uint8_t msa_sum_u8(v16u8 __a)*/
@@ -756,7 +756,7 @@ typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
  v4i32 _c32;                                    \
  _b16 = __builtin_msa_hadd_s_h(__a, __a);       \
  _c32 = __builtin_msa_hadd_s_w(_b16, _b16);         \
-  (int8_t)msa_sum_s32(_c32);                     \
+  (int16_t)msa_sum_s32(_c32);                     \
 })

 /* float msa_sum_f32(v4f32 __a)*/