@@ -113,9 +113,16 @@ static inline void store_interleave(float* ptr, const __m256& a, const __m256& b
     v_transpose4x4(v_uint32x4(a1), v_uint32x4(b1), v_uint32x4(c1), z, u0, u1, u2, u3);
     v_pack4x3to3x4(u0.val, u1.val, u2.val, u3.val, a1, b1, c1);
 
+#if !defined(__GNUC__) || defined(__INTEL_COMPILER)
     _mm256_storeu_ps(ptr, _mm256_setr_m128(_mm_castsi128_ps(a0), _mm_castsi128_ps(b0)));
     _mm256_storeu_ps(ptr + 8, _mm256_setr_m128(_mm_castsi128_ps(c0), _mm_castsi128_ps(a1)));
     _mm256_storeu_ps(ptr + 16, _mm256_setr_m128(_mm_castsi128_ps(b1), _mm_castsi128_ps(c1)));
+#else
+    // GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()"
+    _mm256_storeu_ps(ptr, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(a0)), _mm_castsi128_ps(b0), 1));
+    _mm256_storeu_ps(ptr + 8, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(c0)), _mm_castsi128_ps(a1), 1));
+    _mm256_storeu_ps(ptr + 16, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(b1)), _mm_castsi128_ps(c1), 1));
+#endif
 }
 
 #endif // CV_AVX
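
As an aside (not part of the patch): the #else branch is a functional drop-in for "_mm256_setr_m128()". "_mm256_castps128_ps256()" widens the low 128-bit half (upper lane undefined) and "_mm256_insertf128_ps(..., 1)" fills the upper lane, so both branches produce the same 256-bit register layout. Below is a minimal sketch of a hypothetical "setr_m128_compat()" helper expressing the same fallback, assuming AVX is enabled and <immintrin.h> is available:

#include <immintrin.h>

// Hypothetical helper (illustration only): combine two __m128 halves into one
// __m256, mirroring the #if/#else fallback applied inline in the patch above.
static inline __m256 setr_m128_compat(__m128 lo, __m128 hi)
{
#if !defined(__GNUC__) || defined(__INTEL_COMPILER)
    // Compilers that provide _mm256_setr_m128() can build the register
    // directly: lo -> bits 0..127, hi -> bits 128..255.
    return _mm256_setr_m128(lo, hi);
#else
    // Older GCC lacks _mm256_setr_m128(): widen the low half to 256 bits
    // (upper lane undefined) and insert the high half into lane 1.
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
#endif
}

With such a helper each store would read the same on every compiler, e.g. _mm256_storeu_ps(ptr, setr_m128_compat(_mm_castsi128_ps(a0), _mm_castsi128_ps(b0))); the patch instead applies the fallback inline at each call site.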