From 4a54aa3fbd875995e876bb87e58e0a4fd456e452 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Mon, 22 Apr 2019 10:35:37 +0300 Subject: [PATCH] Cleared up deprecated intrinsics for FP16 --- .../include/opencv2/core/hal/intrin_avx.hpp | 21 ++-------- .../include/opencv2/core/hal/intrin_neon.hpp | 42 ------------------- .../include/opencv2/core/hal/intrin_sse.hpp | 22 ++++------ 3 files changed, 13 insertions(+), 72 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 58db71467d..99271e9980 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -431,19 +431,6 @@ inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a) inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) { return v_float64x4(_mm256_castps_pd(a.val)); } -#if CV_FP16 -inline v_float32x8 v256_load_fp16_f32(const short* ptr) -{ - return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); -} - -inline void v_store_fp16(short* ptr, const v_float32x8& a) -{ - __m128i fp16_value = _mm256_cvtps_ph(a.val, 0); - _mm_store_si128((__m128i*)ptr, fp16_value); -} -#endif - /* Recombine */ /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \ inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ @@ -1400,7 +1387,7 @@ inline v_float32x8 v_cvt_f32(const v_float64x4& a) inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) { __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val); - return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1)); + return v_float32x8(_v256_combine(af, bf)); } inline v_float64x4 v_cvt_f64(const v_int32x8& a) @@ -1474,7 +1461,7 @@ inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx) } inline v_int32x8 v256_lut_quads(const int* tab, const int* idx) { - return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1)); + return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1])))); } inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } @@ -1490,7 +1477,7 @@ inline v_int64x4 v256_lut(const int64* tab, const int* idx) } inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) { - return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1)); + return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1])))); } inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } @@ -1506,7 +1493,7 @@ inline v_float64x4 v256_lut(const double* tab, const int* idx) { return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8)); } -inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); } +inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); } inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) { diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 3b946ff7c6..c6da1b42d9 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -278,48 +278,6 @@ struct v_float64x2 }; #endif -#if CV_FP16 -// Workaround for old compilers -static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; } -static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; } - -static inline float16x4_t cv_vld1_f16(const void* ptr) -{ -#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro - return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); -#else - return vld1_f16((const __fp16*)ptr); -#endif -} -static inline void cv_vst1_f16(void* ptr, float16x4_t a) -{ -#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro - vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); -#else - vst1_f16((__fp16*)ptr, a); -#endif -} - -#ifndef vdup_n_f16 - #define vdup_n_f16(v) (float16x4_t){v, v, v, v} -#endif - -#endif // CV_FP16 - -#if CV_FP16 -inline v_float32x4 v128_load_fp16_f32(const short* ptr) -{ - float16x4_t a = cv_vld1_f16((const __fp16*)ptr); - return v_float32x4(vcvt_f32_f16(a)); -} - -inline void v_store_fp16(short* ptr, const v_float32x4& a) -{ - float16x4_t fp16 = vcvt_f16_f32(a.val); - cv_vst1_f16((short*)ptr, fp16); -} -#endif - #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index a5adad04c5..003270787f 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -2684,19 +2684,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } -#if CV_FP16 -inline v_float32x4 v128_load_fp16_f32(const short* ptr) -{ - return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); -} - -inline void v_store_fp16(short* ptr, const v_float32x4& a) -{ - __m128i fp16_value = _mm_cvtps_ph(a.val, 0); - _mm_storel_epi64((__m128i*)ptr, fp16_value); -} -#endif - ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) @@ -2956,6 +2943,9 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } inline v_float32x4 v_load_expand(const float16_t* ptr) { +#if CV_FP16 + return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); +#else const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000); const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000); const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000)); @@ -2968,10 +2958,15 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) __m128i zmask = _mm_cmpeq_epi32(e, z); __m128i ft = v_select_si128(zmask, zt, t); return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign))); +#endif } inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { +#if CV_FP16 + __m128i fp16_value = _mm_cvtps_ph(v.val, 0); + _mm_storel_epi64((__m128i*)ptr, fp16_value); +#else const __m128i signmask = _mm_set1_epi32(0x80000000); const __m128i rval = _mm_set1_epi32(0x3f000000); @@ -2993,6 +2988,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v) t = _mm_or_si128(t, sign); t = _mm_packs_epi32(t, t); _mm_storel_epi64((__m128i*)ptr, t); +#endif } inline void v_cleanup() {}