Merge pull request #13781 from terfendail:warp_wintr

Resize reworked using wide universal intrinsics (#13781)

* Added a wide universal intrinsics optimized implementation for the 3-channel bit-exact linear resize

* Reworked linear resize using new wide LUT intrinsics

* Fix for VSX intrinsics
Authored by Vitaly Tuzov 6 years ago, committed by Alexander Alekhin
parent 8cedc052ca
commit 334c4d62b5
  1. modules/core/include/opencv2/core/hal/intrin.hpp (21 changed lines)
  2. modules/core/include/opencv2/core/hal/intrin_avx.hpp (134 changed lines)
  3. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (73 changed lines)
  4. modules/core/include/opencv2/core/hal/intrin_neon.hpp (225 changed lines)
  5. modules/core/include/opencv2/core/hal/intrin_sse.hpp (196 changed lines)
  6. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (130 changed lines)
  7. modules/imgproc/src/resize.cpp (268 changed lines)

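The central new primitive in this commit is the family of wide LUT (lookup table) intrinsics. A minimal scalar sketch of their semantics for a 16-lane u8 vector, matching the reference definitions added to intrin_cpp.hpp further down (the helper names here are illustrative, not OpenCV API):

#include <cstdint>
// vx_lut gathers single elements, vx_lut_pairs gathers consecutive pairs and
// vx_lut_quads gathers consecutive quadruples, so a 16-lane vector needs
// 16, 8 and 4 indices respectively.
static void lut_u8(const uint8_t* tab, const int* idx, uint8_t out[16])
{
    for (int i = 0; i < 16; i++) out[i] = tab[idx[i]];
}
static void lut_pairs_u8(const uint8_t* tab, const int* idx, uint8_t out[16])
{
    for (int i = 0; i < 16; i++) out[i] = tab[idx[i / 2] + i % 2];
}
static void lut_quads_u8(const uint8_t* tab, const int* idx, uint8_t out[16])
{
    for (int i = 0; i < 16; i++) out[i] = tab[idx[i / 4] + i % 4];
}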
@@ -234,7 +234,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }
inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \
inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \
inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
@@ -244,6 +249,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
@@ -251,14 +257,19 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
@@ -335,11 +346,11 @@ namespace CV__SIMD_NAMESPACE {
typedef v_uint64x4 v_uint64;
typedef v_int64x4 v_int64;
typedef v_float32x8 v_float32;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
#endif
inline void vx_cleanup() { v256_cleanup(); }
} // namespace
using namespace CV__SIMD_NAMESPACE;
@@ -358,11 +369,9 @@ namespace CV__SIMD_NAMESPACE {
typedef v_uint64x2 v_uint64;
typedef v_int64x2 v_int64;
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
#endif
inline void vx_cleanup() { v_cleanup(); }

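The intrin.hpp changes above only wire the new operations into the vx_-prefixed wrappers for the widest available vector type; note that the 64-bit integer and float16 types get no quad LUT. As an illustrative expansion (not a quote of the header), the 128-bit ushort instantiation with prefix v produces:

inline v_uint16 vx_lut(const ushort* ptr, const int* idx)       { return v_lut(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort* ptr, const int* idx) { return v_lut_pairs(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return v_lut_quads(ptr, idx); }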
@@ -1417,6 +1417,97 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
////////////// Lookup table access ////////////////////
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
}
inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
*(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
*(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
*(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
}
inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
}
inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
}
inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
}
inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2));//Looks like intrinsic has wrong definition
#else
return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#endif
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#else
return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#endif
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
inline v_int64x4 v256_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
#else
return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
#endif
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x8 v256_lut(const float* tab, const int* idx)
{
return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
}
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); }
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
@@ -1476,6 +1567,49 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_flo
y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
}
inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
{
return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
}
inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
{
return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
}
inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
{
return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
}
inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
{
return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
}
inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
{
return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
_mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
_mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
////////// Matrix operations /////////
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)

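The AVX2 implementation above maps most LUT loads onto hardware gathers. A scalar model of the gather addressing (illustrative, not the intrinsic itself) shows why v256_lut_quads over schar can use a 32-bit gather with scale 1: each gathered int covers four consecutive bytes starting at tab + idx[i].

#include <cstdint>
#include <cstring>
// _mm256_i32gather_epi32(base, vindex, scale) loads 8 ints from
// (const char*)base + vindex[i] * scale.
static void i32gather_epi32_model(const void* base, const int idx[8], int scale, int out[8])
{
    for (int i = 0; i < 8; i++)
        std::memcpy(&out[i], (const char*)base + (int64_t)idx[i] * (int64_t)scale, sizeof(int));
}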
@@ -1799,6 +1799,28 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i]];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 2] + i % 2];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 4] + i % 4];
return c;
}
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
v_reg<int, n> c;
@@ -1807,6 +1829,14 @@ template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>&
return c;
}
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
v_reg<unsigned, n> c;
for (int i = 0; i < n; i++)
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
v_reg<float, n> c;
@@ -1845,6 +1875,49 @@ template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<in
}
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[4*i ] = vec.s[4*i ];
c.s[4*i+1] = vec.s[4*i+2];
c.s[4*i+2] = vec.s[4*i+1];
c.s[4*i+3] = vec.s[4*i+3];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/8; i++)
{
c.s[8*i ] = vec.s[8*i ];
c.s[8*i+1] = vec.s[8*i+4];
c.s[8*i+2] = vec.s[8*i+1];
c.s[8*i+3] = vec.s[8*i+5];
c.s[8*i+4] = vec.s[8*i+2];
c.s[8*i+5] = vec.s[8*i+6];
c.s[8*i+6] = vec.s[8*i+3];
c.s[8*i+7] = vec.s[8*i+7];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[3*i ] = vec.s[4*i ];
c.s[3*i+1] = vec.s[4*i+1];
c.s[3*i+2] = vec.s[4*i+2];
}
return c;
}
/** @brief Transpose 4x4 matrix
Scheme:

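The reference implementations above define the lane-order contracts that the platform-specific shuffles in the other backends have to reproduce. A standalone check of those formulas on an 8-lane vector (illustrative only):

#include <cstdio>
int main()
{
    int v[8] = {0, 1, 2, 3, 4, 5, 6, 7}, pairs[8], quads[8], trip[6];
    for (int i = 0; i < 8 / 4; i++)          // v_interleave_pairs
    {
        pairs[4*i    ] = v[4*i    ]; pairs[4*i + 1] = v[4*i + 2];
        pairs[4*i + 2] = v[4*i + 1]; pairs[4*i + 3] = v[4*i + 3];
    }
    for (int j = 0; j < 4; j++)              // v_interleave_quads (single 8-lane group)
    {
        quads[2*j    ] = v[j];
        quads[2*j + 1] = v[j + 4];
    }
    for (int i = 0; i < 8 / 4; i++)          // v_pack_triplets (drops every 4th lane)
        for (int k = 0; k < 3; k++)
            trip[3*i + k] = v[4*i + k];
    printf("pairs:");    for (int k = 0; k < 8; k++) printf(" %d", pairs[k]); // 0 2 1 3 4 6 5 7
    printf("\nquads:");  for (int k = 0; k < 8; k++) printf(" %d", quads[k]); // 0 4 1 5 2 6 3 7
    printf("\ntriplets:"); for (int k = 0; k < 6; k++) printf(" %d", trip[k]); // 0 1 2 4 5 6
    printf("\n");
    return 0;
}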
@@ -1572,6 +1572,162 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
{
tab[idx[ 0]],
tab[idx[ 1]],
tab[idx[ 2]],
tab[idx[ 3]],
tab[idx[ 4]],
tab[idx[ 5]],
tab[idx[ 6]],
tab[idx[ 7]],
tab[idx[ 8]],
tab[idx[ 9]],
tab[idx[10]],
tab[idx[11]],
tab[idx[12]],
tab[idx[13]],
tab[idx[14]],
tab[idx[15]]
};
return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
*(short*)(tab+idx[0]),
*(short*)(tab+idx[1]),
*(short*)(tab+idx[2]),
*(short*)(tab+idx[3]),
*(short*)(tab+idx[4]),
*(short*)(tab+idx[5]),
*(short*)(tab+idx[6]),
*(short*)(tab+idx[7])
};
return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems)));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
};
return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems)));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]],
tab[idx[4]],
tab[idx[5]],
tab[idx[6]],
tab[idx[7]]
};
return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
};
return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems)));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems)));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_int32x4(vld1q_s32(elems));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems)));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(vld1q_s32(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vld1q_s64(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
float CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_float32x4(vld1q_f32(elems));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
uint64 CV_DECL_ALIGNED(32) elems[2] =
{
*(uint64*)(tab + idx[0]),
*(uint64*)(tab + idx[1])
};
return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
return v_float32x4(vld1q_f32(tab + idx[0]));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) elems[4] =
@@ -1584,6 +1740,18 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
return v_int32x4(vld1q_s32(elems));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
unsigned CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_uint32x4(vld1q_u32(elems));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
float CV_DECL_ALIGNED(32) elems[4] =
@@ -1614,7 +1782,64 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa
y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)))));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
double CV_DECL_ALIGNED(32) elems[2] =
{
tab[idx[0]],
tab[idx[1]]
};
return v_float64x2(vld1q_f64(elems));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
return v_float64x2(vld1q_f64(tab + idx[0]));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
double CV_DECL_ALIGNED(32) elems[2] =

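NEON has no general gather instruction, so every LUT load above stages the gathered scalars in a small aligned buffer and then issues a single vector load; moving from single elements to pairs and quads simply reduces the number of scalar stores. A minimal sketch of the pattern, assuming a GCC/Clang-style alignment attribute:

#include <arm_neon.h>
// Gather-via-buffer sketch (not OpenCV code), equivalent to v_lut(const int*, ...).
static inline int32x4_t lut_s32_sketch(const int* tab, const int* idx)
{
    int buf[4] __attribute__((aligned(16))) =
        { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
    return vld1q_s32(buf);   // one 128-bit load of the staged elements
}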
@@ -2699,6 +2699,126 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
#else
return v_int8x16(_mm_setr_epi64(
_mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
_mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
));
#endif
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
#else
return v_int8x16(_mm_setr_epi64(
_mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
_mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
));
#endif
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
#else
return v_int8x16(_mm_setr_epi64(
_mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
_mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
));
#endif
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
#else
return v_int16x8(_mm_setr_epi64(
_mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
_mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
));
#endif
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
#else
return v_int16x8(_mm_setr_epi64(
_mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
_mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
));
#endif
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
#if defined(_MSC_VER)
return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
tab[idx[2]], tab[idx[3]]));
#else
return v_int32x4(_mm_setr_epi64(
_mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
_mm_setr_pi32(tab[idx[2]], tab[idx[3]])
));
#endif
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -2706,6 +2826,11 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -2751,6 +2876,77 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
#if CV_SSSE3
return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
#else
__m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
#endif
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
#if CV_SSSE3
return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
#else
__m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
#endif
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
#if CV_SSSE3
return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
#else
__m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
#endif
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
#if CV_SSSE3
return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
#else
return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
#endif
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
#if CV_SSSE3
return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
#else
__m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
__m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8))));
return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
#endif
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
#if CV_SSSE3
return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
#else
return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
#endif
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
////////////// FP16 support ///////////////////////////

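On SSE the pair and quad LUTs collapse into plain 64- and 128-bit loads once the element width times the group size reaches the register width, e.g. v_lut_quads(const int*) above is a single load at tab + idx[0]. The scalar equivalent, following the reference formula (helper name is illustrative):

// For a 4-lane int vector, tab[idx[i / 4] + i % 4] == tab[idx[0] + i], so one
// 128-bit load starting at tab + idx[0] yields the whole quad.
static inline void lut_quads_s32(const int* tab, const int* idx, int out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = tab[idx[i / 4] + i % 4];
}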
@@ -993,6 +993,80 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
*(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(vsx_ld(0, tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vsx_ld2(0, tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -1000,6 +1074,13 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -1030,6 +1111,55 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
vec_short8 vec0 = vec_mergeh((vec_short8)vec.val, (vec_short8)vec_mergesql(vec.val, vec.val));
vec0 = vec_mergeh(vec0, vec_mergesql(vec0, vec0));
return v_int8x16(vec_mergeh((vec_char16)vec0, (vec_char16)vec_mergesql(vec0, vec0)));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
vec_char16 vec0 = (vec_char16)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val));
return v_int8x16(vec_mergeh(vec0, vec_mergesql(vec0, vec0)));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
vec_short8 vec0 = (vec_short8)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val));
return v_int16x8(vec_mergeh(vec0, vec_mergesql(vec0, vec0)));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
return v_int16x8(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val)));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
return v_int32x4(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val)));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
schar CV_DECL_ALIGNED(32) val[16];
v_store_aligned(val, vec);
return v_int8x16(val[0], val[1], val[2], val[4], val[5], val[6], val[8], val[9], val[10], val[12], val[13], val[14], val[15], val[15], val[15], val[15]);
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
short CV_DECL_ALIGNED(32) val[8];
v_store_aligned(val, vec);
return v_int16x8(val[0], val[1], val[2], val[4], val[5], val[6], val[7], val[7]);
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
/////// FP16 support ////////
// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)

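The VSX backend above builds the LUTs from element constructors and implements v_pack_triplets by storing to an aligned scalar buffer and re-reading selected lanes. A scalar model of that lane selection for 16 schar lanes (illustrative only):

// Keep lanes {0,1,2, 4,5,6, 8,9,10, 12,13,14}, pad the tail with lane 15;
// the 8-lane short variant keeps {0,1,2, 4,5,6} and pads with lane 7.
static void pack_triplets_s8_model(const signed char in[16], signed char out[16])
{
    int j = 0;
    for (int i = 0; i < 4; i++)
        for (int k = 0; k < 3; k++)
            out[j++] = in[4*i + k];
    while (j < 16)
        out[j++] = in[15];
}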
@@ -340,155 +340,6 @@ static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_mi
hline<ET, FT, n, mulall, cncnt>::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
};
#if CV_SIMD512
inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint16(
*((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])),
*((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])),
*((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])),
*((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
*((uint16_t*)(src + ofst[16])), *((uint16_t*)(src + ofst[17])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
*((uint16_t*)(src + ofst[20])), *((uint16_t*)(src + ofst[21])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
*((uint16_t*)(src + ofst[24])), *((uint16_t*)(src + ofst[25])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])),
*((uint16_t*)(src + ofst[28])), *((uint16_t*)(src + ofst[29])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))),
v_src0, v_src1);
}
inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint32(
*((uint32_t*)(src + 2 * ofst[ 0])), *((uint32_t*)(src + 2 * ofst[ 1])), *((uint32_t*)(src + 2 * ofst[ 2])), *((uint32_t*)(src + 2 * ofst[ 3])),
*((uint32_t*)(src + 2 * ofst[ 4])), *((uint32_t*)(src + 2 * ofst[ 5])), *((uint32_t*)(src + 2 * ofst[ 6])), *((uint32_t*)(src + 2 * ofst[ 7])),
*((uint32_t*)(src + 2 * ofst[ 8])), *((uint32_t*)(src + 2 * ofst[ 9])), *((uint32_t*)(src + 2 * ofst[10])), *((uint32_t*)(src + 2 * ofst[11])),
*((uint32_t*)(src + 2 * ofst[12])), *((uint32_t*)(src + 2 * ofst[13])), *((uint32_t*)(src + 2 * ofst[14])), *((uint32_t*)(src + 2 * ofst[15])))),
v_src0, v_src1);
v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
}
inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint64(
*((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])),
*((uint64_t*)(src + 4 * ofst[4])), *((uint64_t*)(src + 4 * ofst[5])), *((uint64_t*)(src + 4 * ofst[6])), *((uint64_t*)(src + 4 * ofst[7])))),
v_src0, v_src1);
v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1);
}
inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
{
v_expand(v_reinterpret_as_u16(v_uint32(
*((uint32_t*)(src + ofst[ 0])), *((uint32_t*)(src + ofst[ 1])), *((uint32_t*)(src + ofst[ 2])), *((uint32_t*)(src + ofst[ 3])),
*((uint32_t*)(src + ofst[ 4])), *((uint32_t*)(src + ofst[ 5])), *((uint32_t*)(src + ofst[ 6])), *((uint32_t*)(src + ofst[ 7])),
*((uint32_t*)(src + ofst[ 8])), *((uint32_t*)(src + ofst[ 9])), *((uint32_t*)(src + ofst[10])), *((uint32_t*)(src + ofst[11])),
*((uint32_t*)(src + ofst[12])), *((uint32_t*)(src + ofst[13])), *((uint32_t*)(src + ofst[14])), *((uint32_t*)(src + ofst[15])))),
v_src0, v_src1);
v_uint32 v_tmp0, v_tmp1;
v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
}
#elif CV_SIMD256
inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint16(
*((uint16_t*)(src + ofst[ 0])), *((uint16_t*)(src + ofst[ 1])), *((uint16_t*)(src + ofst[ 2])), *((uint16_t*)(src + ofst[ 3])),
*((uint16_t*)(src + ofst[ 4])), *((uint16_t*)(src + ofst[ 5])), *((uint16_t*)(src + ofst[ 6])), *((uint16_t*)(src + ofst[ 7])),
*((uint16_t*)(src + ofst[ 8])), *((uint16_t*)(src + ofst[ 9])), *((uint16_t*)(src + ofst[10])), *((uint16_t*)(src + ofst[11])),
*((uint16_t*)(src + ofst[12])), *((uint16_t*)(src + ofst[13])), *((uint16_t*)(src + ofst[14])), *((uint16_t*)(src + ofst[15])))),
v_src0, v_src1);
}
inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint32(
*((uint32_t*)(src + 2 * ofst[0])), *((uint32_t*)(src + 2 * ofst[1])), *((uint32_t*)(src + 2 * ofst[2])), *((uint32_t*)(src + 2 * ofst[3])),
*((uint32_t*)(src + 2 * ofst[4])), *((uint32_t*)(src + 2 * ofst[5])), *((uint32_t*)(src + 2 * ofst[6])), *((uint32_t*)(src + 2 * ofst[7])))),
v_src0, v_src1);
v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
v_zip(v_reinterpret_as_u32(v_src0), v_reinterpret_as_u32(v_src1), v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
v_zip(v_reinterpret_as_u16(v_tmp2), v_reinterpret_as_u16(v_tmp3), v_src0, v_src1);
}
inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_expand(v_reinterpret_as_u8(v_uint64(
*((uint64_t*)(src + 4 * ofst[0])), *((uint64_t*)(src + 4 * ofst[1])), *((uint64_t*)(src + 4 * ofst[2])), *((uint64_t*)(src + 4 * ofst[3])))),
v_src0, v_src1);
v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
v_zip(v_reinterpret_as_u64(v_src0), v_reinterpret_as_u64(v_src1), v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
}
inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
{
v_uint32 v_tmp0, v_tmp1;
v_expand(v_reinterpret_as_u16(v_uint32(
*((uint32_t*)(src + ofst[0])), *((uint32_t*)(src + ofst[1])), *((uint32_t*)(src + ofst[2])), *((uint32_t*)(src + ofst[3])),
*((uint32_t*)(src + ofst[4])), *((uint32_t*)(src + ofst[5])), *((uint32_t*)(src + ofst[6])), *((uint32_t*)(src + ofst[7])))),
v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
}
#elif CV_SIMD128
inline void v_load_indexed1(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
uint16_t buf[8];
buf[0] = *((uint16_t*)(src + ofst[0]));
buf[1] = *((uint16_t*)(src + ofst[1]));
buf[2] = *((uint16_t*)(src + ofst[2]));
buf[3] = *((uint16_t*)(src + ofst[3]));
buf[4] = *((uint16_t*)(src + ofst[4]));
buf[5] = *((uint16_t*)(src + ofst[5]));
buf[6] = *((uint16_t*)(src + ofst[6]));
buf[7] = *((uint16_t*)(src + ofst[7]));
v_src0 = vx_load_expand((uint8_t*)buf);
v_src1 = vx_load_expand((uint8_t*)buf + 8);
}
inline void v_load_indexed2(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
uint32_t buf[4];
buf[0] = *((uint32_t*)(src + 2 * ofst[0]));
buf[1] = *((uint32_t*)(src + 2 * ofst[1]));
buf[2] = *((uint32_t*)(src + 2 * ofst[2]));
buf[3] = *((uint32_t*)(src + 2 * ofst[3]));
v_uint32 v_tmp0, v_tmp1, v_tmp2, v_tmp3;
v_tmp0 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf));
v_tmp1 = v_reinterpret_as_u32(vx_load_expand((uint8_t*)buf + 8));
v_zip(v_tmp0, v_tmp1, v_tmp2, v_tmp3);
v_zip(v_tmp2, v_tmp3, v_tmp0, v_tmp1);
v_zip(v_reinterpret_as_u16(v_tmp0), v_reinterpret_as_u16(v_tmp1), v_src0, v_src1);
}
inline void v_load_indexed4(uint8_t* src, int *ofst, v_uint16 &v_src0, v_uint16 &v_src1)
{
v_uint16 v_tmp0, v_tmp1;
v_src0 = vx_load_expand(src + 4 * ofst[0]);
v_src1 = vx_load_expand(src + 4 * ofst[1]);
v_recombine(v_src0, v_src1, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
}
inline void v_load_indexed_deinterleave(uint16_t* src, int *ofst, v_uint32 &v_src0, v_uint32 &v_src1)
{
uint32_t buf[4];
buf[0] = *((uint32_t*)(src + ofst[0]));
buf[1] = *((uint32_t*)(src + ofst[1]));
buf[2] = *((uint32_t*)(src + ofst[2]));
buf[3] = *((uint32_t*)(src + ofst[3]));
v_src0 = vx_load_expand((uint16_t*)buf);
v_src1 = vx_load_expand((uint16_t*)buf + 4);
v_uint32 v_tmp0, v_tmp1;
v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src0, v_src1);
}
#endif
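Note that the removed helpers above duplicated the gather logic once per SIMD width, and the CV_SIMD512 v_load_indexed1 variant repeats ofst[14]/ofst[15] where the higher indices were presumably intended. The reworked loops below route these loads through the shared vx_lut_pairs/vx_lut_quads wrappers instead, so each platform keeps a single gather implementation.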
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
@@ -507,16 +358,23 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
*(dst++) = src_0;
}
#if CV_SIMD
for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ)
{
v_uint16 v_src0, v_src1;
v_load_indexed1(src, ofst + i, v_src0, v_src1);
v_int16 v_mul0 = vx_load((int16_t*)m);
v_int16 v_mul1 = vx_load((int16_t*)m + VECSZ);
v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0));
v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1));
v_store((uint16_t*)dst, v_pack(v_res0, v_res1));
v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
v_store((uint16_t*)dst , v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
v_expand(vx_lut_pairs(src, ofst + i + VECSZ), v_src0, v_src1);
v_store((uint16_t*)dst+VECSZ, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m + 2*VECSZ))),
v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + 3*VECSZ)))));
}
if (i <= dst_max - VECSZ)
{
v_uint16 v_src0, v_src1;
v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), vx_load((int16_t*)m))),
v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), vx_load((int16_t*)m + VECSZ)))));
i += VECSZ; m += 2*VECSZ; dst += VECSZ;
}
#endif
for (; i < dst_max; i += 1, m += 2)
@@ -564,7 +422,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *o
for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ)
{
v_uint16 v_src0, v_src1;
v_load_indexed2(src, ofst + i, v_src0, v_src1);
v_expand(v_interleave_pairs(v_reinterpret_as_u8(vx_lut_pairs((uint16_t*)src, ofst + i))), v_src0, v_src1);
v_uint32 v_mul = vx_load((uint32_t*)m);//AaBbCcDd
v_uint32 v_zip0, v_zip1;
@@ -595,6 +453,81 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *o
}
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
union {
uint64_t q;
uint16_t w[4];
} srccn;
((ufixedpoint16*)(srccn.w))[0] = src[0];
((ufixedpoint16*)(srccn.w))[1] = src[1];
((ufixedpoint16*)(srccn.w))[2] = src[2];
((ufixedpoint16*)(srccn.w))[3] = 0;
#if CV_SIMD
const int VECSZ = v_uint16::nlanes;
v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
#endif
for (; i < dst_min; i++, m += 2)
{
*(dst++) = ((ufixedpoint16*)(srccn.w))[0];
*(dst++) = ((ufixedpoint16*)(srccn.w))[1];
*(dst++) = ((ufixedpoint16*)(srccn.w))[2];
}
#if CV_SIMD
CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2];
for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2)
{
v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3));
v_uint8 v_src01, v_src23;
v_uint16 v_src0, v_src1, v_src2, v_src3;
v_zip(vx_lut_quads(src, ofst3), vx_lut_quads(src+3, ofst3), v_src01, v_src23);
v_expand(v_src01, v_src0, v_src1);
v_expand(v_src23, v_src2, v_src3);
v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
v_zip(v_mul0, v_mul0, v_mul3, v_tmp );//AaAaBbBb CcCcDdDd
v_zip(v_mul3, v_mul3, v_mul0, v_mul1);//AaAaAaAa BbBbBbBb
v_zip(v_tmp , v_tmp , v_mul2, v_mul3);//CcCcCcCc DdDdDdDd
v_uint32 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_mul0)));
v_uint32 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_reinterpret_as_s16(v_mul1)));
v_uint32 v_res2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src2), v_reinterpret_as_s16(v_mul2)));
v_uint32 v_res3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src3), v_reinterpret_as_s16(v_mul3)));
v_store((uint16_t*)dst , v_pack_triplets(v_pack(v_res0, v_res1)));
v_store((uint16_t*)dst + 3*VECSZ/4, v_pack_triplets(v_pack(v_res2, v_res3)));
}
#endif
for (; i < dst_max; i += 1, m += 2)
{
uint8_t* px = src + 3 * ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[3];
*(dst++) = m[0] * px[1] + m[1] * px[4];
*(dst++) = m[0] * px[2] + m[1] * px[5];
}
((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0];
((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1];
((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2];
#if CV_SIMD
v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
#endif
for (; i < dst_width; i++)
{
*(dst++) = ((ufixedpoint16*)(srccn.w))[0];
*(dst++) = ((ufixedpoint16*)(srccn.w))[1];
*(dst++) = ((ufixedpoint16*)(srccn.w))[2];
}
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
@@ -614,20 +547,19 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
v_store((uint16_t*)dst, v_srccn);
}
#endif
if (i < dst_min) // Points that fall left from src image so became equal to leftmost src point
for (; i < dst_min; i++, m += 2)
{
*(dst++) = ((ufixedpoint16*)(srccn.w))[0];
*(dst++) = ((ufixedpoint16*)(srccn.w))[1];
*(dst++) = ((ufixedpoint16*)(srccn.w))[2];
*(dst++) = ((ufixedpoint16*)(srccn.w))[3];
i++; m += 2;
}
#if CV_SIMD
for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ)
{
v_uint16 v_src0, v_src1, v_src2, v_src3;
v_load_indexed4(src, ofst + i, v_src0, v_src1);
v_load_indexed4(src, ofst + i + VECSZ/4, v_src2, v_src3);
v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i))), v_src0, v_src1);
v_expand(v_interleave_quads(v_reinterpret_as_u8(vx_lut_pairs((uint32_t*)src, ofst + i + VECSZ/4))), v_src2, v_src3);
v_uint32 v_mul0, v_mul1, v_mul2, v_mul3, v_tmp;
v_mul0 = vx_load((uint32_t*)m);//AaBbCcDd
@@ -660,7 +592,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
v_store((uint16_t*)dst, v_srccn);
}
#endif
if (i < dst_width)
for (; i < dst_width; i++)
{
*(dst++) = ((ufixedpoint16*)(srccn.w))[0];
*(dst++) = ((ufixedpoint16*)(srccn.w))[1];
@@ -689,10 +621,12 @@ void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int
for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
{
v_uint32 v_src0, v_src1;
v_load_indexed_deinterleave(src, ofst + i, v_src0, v_src1);
v_uint32 v_mul0, v_mul1;
v_load_deinterleave((uint32_t*)m, v_mul0, v_mul1);
v_store((uint32_t*)dst, v_src0 * v_mul0 + v_src1 * v_mul1);//abcd
v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m));
v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ));
v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32),
(v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32)));
}
#endif
for (; i < dst_max; i += 1, m += 2)

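In resize.cpp the per-SIMD-width v_load_indexed* helpers are gone: the horizontal filters now combine vx_lut_pairs/vx_lut_quads with v_interleave_pairs, v_interleave_quads and v_pack_triplets, a dedicated 3-channel u8 specialization is added, and the u16 path emulates the pairwise 32-bit multiply-accumulate with 64-bit masking. All of these vector paths implement the same per-pixel arithmetic; a scalar model (illustrative, and assuming ufixedpoint16 holds raw Q8.8 bits) is:

#include <cstdint>
// One horizontal two-tap blend for cn interleaved channels of u8 input:
// dst[c] = m0 * src[cn*ofst + c] + m1 * src[cn*(ofst + 1) + c], kept as 16-bit fixed point.
static void hline_tap_u8(const uint8_t* src, int ofst, int cn,
                         const uint16_t m[2], uint16_t* dst)
{
    const uint8_t* px = src + cn * ofst;
    for (int c = 0; c < cn; c++)
        dst[c] = (uint16_t)(m[0] * px[c] + m[1] * px[c + cn]);
}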