Merge pull request #14210 from terfendail:wui_512

AVX512 wide universal intrinsics (#14210)

* Added implementation of 512-bit wide universal intrinsics(WIP)

* Added implementation of 512-bit wide universal intrinsics: implemented WUI vector types(WIP)

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented load/store

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented fp16 load/store

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented recombine and zip, implemented non-saturating and saturating arithmetic

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented bit operations

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented comparisons

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented lane shifts and reduction

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented absolute values

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented rounding and cast to float

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented LUT

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented type extension/narrowing and matrix operations

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented load_deinterleave for 2- and 3-channel images

* Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented load_deinterleave for 2-channel images and implemented it for 4-channel images

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented store_interleave

* Added implementation of 512-bit wide universal intrinsics(WIP): implemented signmask and checks

* Added implementation of 512-bit wide universal intrinsics(WIP): build fixes

* Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented popcount in case AVX512_BITALG is unavailable

* Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented zip

* Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented rotate for s8 and s16

* Added implementation of 512-bit wide universal intrinsics(WIP): reimplemented interleave/deinterleave for s8 and s16

* Added implementation of 512-bit wide universal intrinsics(WIP): updated v512_set macros

* Added implementation of 512-bit wide universal intrinsics(WIP): fix for GCC's incorrect _mm512_abs_pd definition

* Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_zip to avoid AVX512_VBMI intrinsics

* Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_invsqrt to avoid AVX512_ER intrinsics

* Added implementation of 512-bit wide universal intrinsics(WIP): reworked v_rotate, v_popcount and interleave/deinterleave for U8 to avoid AVX512_VBMI intrinsics

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed integral image SIMD part

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed warnings

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed load_deinterleave for u8 and u16

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed v_invsqrt accuracy for f64

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave/deinterleave for u32 and u64

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed interleave_pairs, interleave_quads and pack_triplets

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed rotate_left/right, part 2

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed the resize implementation based on 512-wide universal intrinsics

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed findContours by avoiding use of the uint64-dependent 512-wide v_signmask()

* Added implementation of 512-bit wide universal intrinsics(WIP): fixed trailing whitespaces

* Added implementation of 512-bit wide universal intrinsics(WIP): reworked parts that depend on specific intrinsic sets to check intrinsic availability via CPU feature group defines

* Added implementation of 512-bit wide universal intrinsics(WIP): Updated AVX512 implementation of v_popcount to avoid AVX512VPOPCNTDQ intrinsics if unavailable.

* Added implementation of 512-bit wide universal intrinsics(WIP): Fixed universal intrinsics data initialisation, v_mul_wrap, v_floor, v_ceil and v_signmask.

* Added implementation of 512-bit wide universal intrinsics(WIP): Removed hasSIMD512()

* Added implementation of 512-bit wide universal intrinsics(WIP): Fixes for gcc build

* Added implementation of 512-bit wide universal intrinsics(WIP): Reworked v_signmask, v_check_any() and v_check_all() implementation.
Vitaly Tuzov authored 6 years ago, committed by Alexander Alekhin
parent 3289a0aff9
commit 3b015dfc7d
  1. modules/core/CMakeLists.txt (5 changed lines)
  2. modules/core/include/opencv2/core/hal/intrin.hpp (42 changed lines)
  3. modules/core/include/opencv2/core/hal/intrin_avx512.hpp (2743 changed lines)
  4. modules/core/include/opencv2/core/hal/intrin_forward.hpp (29 changed lines)
  5. modules/core/test/test_intrin.cpp (33 changed lines)
  6. modules/core/test/test_intrin512.simd.hpp (23 changed lines)
  7. modules/core/test/test_intrin_utils.hpp (2 changed lines)
  8. modules/imgproc/src/contours.cpp (34 changed lines)
  9. modules/imgproc/src/resize.cpp (2 changed lines)
  10. modules/imgproc/src/sumpixels.cpp (8 changed lines)
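
To make the change concrete, here is a minimal sketch (not part of this patch) of code written against the width-agnostic wide universal intrinsics. The same source processes 16, 32 or 64 uchar lanes per iteration depending on the CV_SIMD_WIDTH selected at compile time (64 with an AVX512_SKX baseline or dispatch target); names other than the intrinsics themselves are illustrative.

#include "opencv2/core/hal/intrin.hpp"

static void add_saturate_u8(const uchar* a, const uchar* b, uchar* dst, int n)
{
    int i = 0;
#if CV_SIMD
    for (; i <= n - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes)
    {
        cv::v_uint8 va = cv::vx_load(a + i);   // v512_load() when CV_SIMD_WIDTH == 64
        cv::v_uint8 vb = cv::vx_load(b + i);
        cv::vx_store(dst + i, va + vb);        // operator+ saturates for 8-bit lanes
    }
    cv::vx_cleanup();
#endif
    for (; i < n; ++i)                         // scalar tail and non-SIMD fallback
    {
        int s = a[i] + b[i];
        dst[i] = (uchar)(s > 255 ? 255 : s);
    }
}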

@@ -13,8 +13,9 @@ ocv_add_dispatched_file(split SSE2 AVX2)
ocv_add_dispatched_file(sum SSE2 AVX2)
# dispatching for accuracy tests
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)
ocv_add_module(core
OPTIONAL opencv_cudev

@@ -180,6 +180,18 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
// AVX512 can be used together with SSE2 and AVX2, so
// we define those sets of intrinsics at once.
// Some of the AVX512 intrinsics get the v512_ prefix instead of v_, e.g. v512_load() vs v_load().
// Wide intrinsics will be mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load())
#if CV_AVX512_SKX
#define CV__SIMD_FORWARD 512
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx512.hpp"
#endif
//! @cond IGNORED
namespace cv {
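
To make the naming rule in the comment above concrete, a short sketch (illustrative, not from the patch): explicitly sized types keep the v512_ prefix, while the width-agnostic vx_ helpers follow whatever CV_SIMD_WIDTH the translation unit was built with.

#include "opencv2/core/hal/intrin.hpp"

static void load_both_ways(const uchar* ptr)
{
#if CV_SIMD512
    cv::v_uint8x64 a = cv::v512_load(ptr);  // always 512-bit, explicit v512_ prefix
    (void)a;
#endif
#if CV_SIMD
    cv::v_uint8 b = cv::vx_load(ptr);       // forwards to v512_load(), v256_load() or
    (void)b;                                // v_load() according to CV_SIMD_WIDTH
#endif
}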
@@ -321,13 +333,41 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
#if CV_SIMD512
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define CV__SIMD_NAMESPACE simd512
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_FP16 CV_SIMD512_FP16
#define CV_SIMD_WIDTH 64
// TODO typedef v_uint8 / v_int32 / etc types here
typedef v_uint8x64 v_uint8;
typedef v_int8x64 v_int8;
typedef v_uint16x32 v_uint16;
typedef v_int16x32 v_int16;
typedef v_uint32x16 v_uint32;
typedef v_int32x16 v_int32;
typedef v_uint64x8 v_uint64;
typedef v_int64x8 v_int64;
typedef v_float32x16 v_float32;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v512)
#if CV_SIMD512_64F
typedef v_float64x8 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v512, load)
#endif
inline void vx_cleanup() { v512_cleanup(); }
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
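
A small compile-time sketch of what the typedefs above imply when this simd512 namespace is the one selected (assumption: the translation unit is built with AVX512_SKX enabled and no narrower width is forced):

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD512 && CV_SIMD_WIDTH == 64
static_assert(cv::v_uint8::nlanes   == 64, "64 uchar lanes per 512-bit register");
static_assert(cv::v_int16::nlanes   == 32, "32 short lanes per 512-bit register");
static_assert(cv::v_float32::nlanes == 16, "16 float lanes per 512-bit register");
#if CV_SIMD_64F
static_assert(cv::v_float64::nlanes == 8,  "8 double lanes per 512-bit register");
#endif
#endif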

File diff suppressed because it is too large

@@ -14,9 +14,32 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#if CV__SIMD_FORWARD == 1024
// [todo] 1024
#error "1024-long ops not implemented yet"
#elif CV__SIMD_FORWARD == 512
// 512
#define __CV_VX(fun) v512_##fun
#define __CV_V_UINT8 v_uint8x64
#define __CV_V_INT8 v_int8x64
#define __CV_V_UINT16 v_uint16x32
#define __CV_V_INT16 v_int16x32
#define __CV_V_UINT32 v_uint32x16
#define __CV_V_INT32 v_int32x16
#define __CV_V_UINT64 v_uint64x8
#define __CV_V_INT64 v_int64x8
#define __CV_V_FLOAT32 v_float32x16
#define __CV_V_FLOAT64 v_float64x8
struct v_uint8x64;
struct v_int8x64;
struct v_uint16x32;
struct v_int16x32;
struct v_uint32x16;
struct v_int32x16;
struct v_uint64x8;
struct v_int64x8;
struct v_float32x16;
struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun

@@ -7,11 +7,15 @@
#include "test_intrin128.simd_declarations.hpp"
#undef CV_CPU_DISPATCH_MODES_ALL
#include "opencv2/core/cv_cpu_dispatch.h"
#include "test_intrin256.simd.hpp"
#include "test_intrin256.simd_declarations.hpp"
#undef CV_CPU_DISPATCH_MODES_ALL
#include "opencv2/core/cv_cpu_dispatch.h"
#include "test_intrin512.simd.hpp"
#include "test_intrin512.simd_declarations.hpp"
#ifdef _MSC_VER
# pragma warning(disable:4702) // unreachable code
#endif
@@ -30,6 +34,11 @@ namespace opencv_test { namespace hal {
throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \
} while(0)
#define DISPATCH_SIMD512(fn, cpu_opt) do { \
CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
throw SkipTestException("SIMD512 (" #cpu_opt ") is not available or disabled"); \
} while(0)
#define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \
TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \
TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \
@@ -67,6 +76,9 @@ DEFINE_SIMD_TESTS(128, AVX)
#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
DEFINE_SIMD_TESTS(128, AVX2)
#endif
#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
DEFINE_SIMD_TESTS(128, AVX512_SKX)
#endif
TEST(hal_intrin128, float16x8_FP16)
{
@@ -91,6 +103,10 @@ namespace intrin256 {
DEFINE_SIMD_TESTS(256, AVX2)
#endif
#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
DEFINE_SIMD_TESTS(256, AVX512_SKX)
#endif
TEST(hal_intrin256, float16x16_FP16)
{
//CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
@@ -101,4 +117,19 @@ TEST(hal_intrin256, float16x16_FP16)
} // namespace intrin256
namespace intrin512 {
#if defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX || defined CV_CPU_BASELINE_COMPILE_AVX512_SKX
DEFINE_SIMD_TESTS(512, AVX512_SKX)
#endif
TEST(hal_intrin512, float16x32_FP16)
{
CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_float16, ());
throw SkipTestException("Unsupported hardware: FP16 is not available");
}
} // namespace intrin512
}} // namespace
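
Roughly what one of the generated 512-bit tests above expands to, combining DEFINE_SIMD_TESTS(512, AVX512_SKX) with the DISPATCH_SIMD512 macro (an illustrative expansion, not literal preprocessor output): the CV_CPU_CALL_AVX512_SKX_ macro runs the test body and returns only if AVX512_SKX code was compiled and the running CPU supports it, otherwise control falls through to the SkipTestException.

TEST(hal_intrin512, uint8x16_AVX512_SKX)
{
    do {
        // calls test_hal_intrin_uint8() and returns when AVX512_SKX is
        // compiled in and available at runtime; otherwise falls through
        CV_CPU_CALL_AVX512_SKX_(test_hal_intrin_uint8, ());
        throw SkipTestException("SIMD512 (AVX512_SKX) is not available or disabled");
    } while (0);
}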

@@ -0,0 +1,23 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \
!defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD512
#define CV__SIMD_FORCE_WIDTH 512
#include "opencv2/core/hal/intrin.hpp"
#undef CV__SIMD_FORCE_WIDTH
#if CV_SIMD_WIDTH != 64
#error "Invalid build configuration"
#endif
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
namespace opencv_test { namespace hal { namespace intrin512 {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
#include "test_intrin_utils.hpp"
CV_CPU_OPTIMIZATION_NAMESPACE_END
}}} //namespace

@@ -811,7 +811,9 @@ template<typename R> struct TheTest
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
EXPECT_EQ(2, v_signmask(a));
#if CV_SIMD_WIDTH <= 32
EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b));
#endif
EXPECT_EQ(false, v_check_all(a));
EXPECT_EQ(false, v_check_all(b));
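
A hedged reading of why the second expectation is now limited to register widths of at most 32 bytes: the reference mask is built as 2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), and for a 512-bit u8 register nlanes is 64, so those shifts run past the width of a 32-bit int and the full 64-lane sign mask would not fit the int returned by v_signmask (the commit message above mentions avoiding the uint64-dependent 512-wide v_signmask for the same reason). A tiny stand-alone illustration, not from the patch:

#include <cstdint>
#include <cstdio>

int main()
{
    const int nlanes = 64;  // u8 lanes in a 512-bit register
    // widened to 64 bits so the value can be formed at all; with a plain
    // 32-bit int the shifts by 32 and 63 would overflow
    uint64_t reference = 2u | (1ull << (nlanes / 2)) | (1ull << (nlanes - 1));
    std::printf("reference mask = 0x%016llx (needs 64 bits)\n",
                (unsigned long long)reference);
    return 0;
}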

@@ -1061,10 +1061,16 @@ cvFindNextContour( CvContourScanner scanner )
}
else
{
v_uint8 v_prev = vx_setall_u8((uchar)prev);
for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
#if CV_SIMD_WIDTH > 16
v_uint8 vx_prev = vx_setall_u8((uchar)prev);
while (x <= width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
x += v_uint8::nlanes;
#endif
v_uint8x16 v_prev = v_setall_u8((uchar)prev);
for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
{
unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(img + x)) != v_prev);
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev);
if (mask)
{
p = img[(x += cv::trailingZeros32(mask))];
@@ -1328,10 +1334,16 @@ CvLinkedRunPoint;
inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
{
#if CV_SIMD
v_uint8 v_zero = vx_setzero_u8();
for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
#if CV_SIMD_WIDTH > 16
v_uint8 vx_zero = vx_setzero_u8();
while (j <= img_size.width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
j += v_uint8::nlanes;
#endif
v_uint8x16 v_zero = v_setzero_u8();
for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
{
unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) != v_zero);
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero);
if (mask)
{
j += cv::trailingZeros32(mask);
@@ -1353,10 +1365,16 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
}
else
{
v_uint8 v_zero = vx_setzero_u8();
#if CV_SIMD_WIDTH > 16
v_uint8 vx_zero = vx_setzero_u8();
while (j <= img_size.width - v_uint8::nlanes &&
v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
j += v_uint8::nlanes;
#endif
v_uint8x16 v_zero = v_setzero_u8();
for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
{
unsigned int mask = (unsigned int)v_signmask(vx_load((uchar*)(src_data + j)) == v_zero);
unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero);
if (mask)
{
j += cv::trailingZeros32(mask);
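
Common to the three contours.cpp hunks above is a two-stage scan: the full-width registers and v_check_all() are used only to skip long runs where every pixel matches, and the exact position is then located with fixed 128-bit vectors and v_signmask(), since (as the commit message notes) a 512-bit u8 sign mask would require a 64-bit result. A minimal scalar model of the pattern, with illustrative names and block sizes, and __builtin_ctz (GCC/Clang) standing in for cv::trailingZeros32():

#include <cstdint>

// Returns the index of the first byte differing from prev, or width if none.
static int find_first_mismatch(const uint8_t* data, int width, uint8_t prev)
{
    int x = 0;
    const int wide = 64, narrow = 16;  // 512-bit and 128-bit u8 lane counts
    // stage 1: v_check_all()-style skip over whole wide vectors of equal pixels
    for (; x <= width - wide; x += wide)
    {
        bool all_equal = true;
        for (int i = 0; i < wide; ++i)
            all_equal = all_equal && (data[x + i] == prev);
        if (!all_equal)
            break;
    }
    // stage 2: v_signmask()-style search in 128-bit blocks; the mask fits in 16 bits
    for (; x <= width - narrow; x += narrow)
    {
        unsigned mask = 0;
        for (int i = 0; i < narrow; ++i)
            mask |= (unsigned)(data[x + i] != prev) << i;
        if (mask)
            return x + __builtin_ctz(mask);
    }
    for (; x < width; ++x)  // scalar tail
        if (data[x] != prev)
            return x;
    return width;
}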

@@ -2148,6 +2148,7 @@ public:
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#endif
@@ -2167,6 +2168,7 @@ public:
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#elif CV_SIMD_WIDTH == 64
v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#endif

@@ -127,7 +127,7 @@ struct Integral_SIMD<uchar, int, double>
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_int32 el4l, el4h;
#if CV_AVX2
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -138,7 +138,7 @@ struct Integral_SIMD<uchar, int, double>
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH == 32
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
@@ -194,7 +194,7 @@ struct Integral_SIMD<uchar, float, double>
{
v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
v_float32 el4l, el4h;
#if CV_AVX2
#if CV_AVX2 && CV_SIMD_WIDTH == 32
__m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
@@ -205,7 +205,7 @@ struct Integral_SIMD<uchar, float, double>
#else
el8 += v_rotate_left<1>(el8);
el8 += v_rotate_left<2>(el8);
#if CV_SIMD_WIDTH == 32
#if CV_SIMD_WIDTH >= 32
el8 += v_rotate_left<4>(el8);
#if CV_SIMD_WIDTH == 64
el8 += v_rotate_left<8>(el8);
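
The rotate-and-add lines above are the classic in-register running-sum pattern: adding copies shifted by 1, 2, 4, ... lanes lets each 16-bit lane accumulate the lanes those steps have reached, so wider registers need more rounds, which is why the <4> step now applies to both 256- and 512-bit widths (the guard change from == 32 to >= 32) and the 512-bit case adds the v_rotate_left<8> step. A small scalar model, assuming v_rotate_left<k> makes lane i take lane i-k with zeros shifted into the low lanes, analogous to the byte-shift form used in the AVX2 branch:

#include <array>
#include <cstddef>
#include <cstdio>

// lane i of the result takes lane i-k of the input; zeros fill the low lanes
template <std::size_t N>
static std::array<int, N> rotate_left(const std::array<int, N>& a, std::size_t k)
{
    std::array<int, N> r{};
    for (std::size_t i = k; i < N; ++i)
        r[i] = a[i - k];
    return r;
}

int main()
{
    std::array<int, 8> lanes{1, 2, 3, 4, 5, 6, 7, 8};        // toy register with 8 lanes
    for (std::size_t step = 1; step < lanes.size(); step *= 2) // 1, 2, 4 here; wider registers add 8, 16, ...
    {
        std::array<int, 8> shifted = rotate_left(lanes, step);
        for (std::size_t i = 0; i < lanes.size(); ++i)
            lanes[i] += shifted[i];
    }
    for (int v : lanes)
        std::printf("%d ", v);  // inclusive prefix sums: 1 3 6 10 15 21 28 36
    std::printf("\n");
    return 0;
}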
