used popcnt

pull/3591/head
Ilya Lavrenov 10 years ago
parent 31827d8dfe
commit fc0869735d
  1. 2
      CMakeLists.txt
  2. 8
      cmake/OpenCVCompilerOptions.cmake
  3. 1
      modules/core/include/opencv2/core/base.hpp
  4. 16
      modules/core/include/opencv2/core/cvdef.h
  5. 77
      modules/core/include/opencv2/core/sse_utils.hpp
  6. 1
      modules/core/include/opencv2/core/utility.hpp
  7. 1
      modules/core/src/arithm.cpp
  8. 233
      modules/core/src/convert.cpp
  9. 1
      modules/core/src/copy.cpp
  10. 1
      modules/core/src/mathfuncs.cpp
  11. 1
      modules/core/src/matmul.cpp
  12. 10
      modules/core/src/stat.cpp
  13. 1
      modules/core/src/system.cpp
  14. 3
      modules/core/src/umatrix.cpp
  15. 1
      modules/imgproc/src/accum.cpp
  16. 1
      modules/imgproc/src/canny.cpp
  17. 1
      modules/imgproc/src/clahe.cpp
  18. 1
      modules/imgproc/src/color.cpp
  19. 1
      modules/imgproc/src/corner.cpp
  20. 1
      modules/imgproc/src/demosaicing.cpp
  21. 1
      modules/imgproc/src/imgwarp.cpp
  22. 1
      modules/imgproc/src/pyramids.cpp
  23. 1
      modules/imgproc/src/smooth.cpp
  24. 1
      modules/imgproc/src/sumpixels.cpp
  25. 6
      modules/ts/src/ts_func.cpp

@ -221,9 +221,9 @@ OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions"
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" ON IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )

@ -145,6 +145,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
endif()
if(ENABLE_AVX2)
add_extra_compiler_option(-mavx2)
if(ENABLE_FMA3)
add_extra_compiler_option(-mfma)
endif()
endif()
# GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed.
@ -165,10 +169,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
add_extra_compiler_option(-msse4.2)
endif()
endif()
if(ENABLE_FMA3)
add_extra_compiler_option(-mfma)
endif()
endif(NOT MINGW)
if(X86 OR X86_64)

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -157,15 +158,11 @@
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __FMA__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <immintrin.h>
# define CV_FMA3 1
# endif
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <popcntintrin.h>
# define CV_POPCNT 1
# endif
# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
@ -179,6 +176,9 @@
# if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# include <immintrin.h>
# define CV_AVX2 1
# if defined __FMA__
# define CV_FMA3 1
# endif
# endif
#endif
@ -194,6 +194,9 @@
#endif // __CUDACC__
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_MMX
# define CV_MMX 0
#endif
@ -221,9 +224,6 @@
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_FMA3
# define CV_FMA3 0
#endif

@ -10,7 +10,7 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -48,6 +48,34 @@
#if CV_SSE2
// Two-channel byte deinterleave, performed in place on four registers.
//
// Treating (v_r0, v_r1, v_g0, v_g1) as one 64-byte sequence in that order,
// the call leaves the even-indexed bytes in (v_r0, v_r1) and the odd-indexed
// bytes in (v_g0, v_g1), each group keeping its original relative order.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // One pass rotates every byte's 6-bit position index right by one bit;
    // five passes therefore move bit 0 (the channel selector) to the top.
    for (int pass = 0; pass < 5; ++pass)
    {
        const __m128i first_lo  = _mm_unpacklo_epi8(v_r0, v_g0);
        const __m128i first_hi  = _mm_unpackhi_epi8(v_r0, v_g0);
        const __m128i second_lo = _mm_unpacklo_epi8(v_r1, v_g1);
        const __m128i second_hi = _mm_unpackhi_epi8(v_r1, v_g1);

        v_r0 = first_lo;
        v_r1 = first_hi;
        v_g0 = second_lo;
        v_g1 = second_hi;
    }
}
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
@ -228,6 +256,29 @@ inline void _mm_interleavee_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
// Two-channel 16-bit deinterleave, performed in place on four registers.
//
// Treating (v_r0, v_r1, v_g0, v_g1) as one sequence of 32 16-bit elements in
// that order, the call leaves the even-indexed elements in (v_r0, v_r1) and
// the odd-indexed elements in (v_g0, v_g1), preserving relative order.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // One pass rotates every element's 5-bit position index right by one
    // bit; four passes move bit 0 (the channel selector) to the top.
    for (int pass = 0; pass < 4; ++pass)
    {
        const __m128i first_lo  = _mm_unpacklo_epi16(v_r0, v_g0);
        const __m128i first_hi  = _mm_unpackhi_epi16(v_r0, v_g0);
        const __m128i second_lo = _mm_unpacklo_epi16(v_r1, v_g1);
        const __m128i second_hi = _mm_unpackhi_epi16(v_r1, v_g1);

        v_r0 = first_lo;
        v_r1 = first_hi;
        v_g0 = second_lo;
        v_g1 = second_hi;
    }
}
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
@ -300,6 +351,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
#if CV_SSE4_1
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
__m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
@ -376,6 +429,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
#endif // CV_SSE4_1
// Two-channel 32-bit deinterleave, performed in place on four registers.
//
// Treating (v_r0, v_r1, v_g0, v_g1) as one sequence of 16 32-bit elements in
// that order, the call leaves the even-indexed elements in (v_r0, v_r1) and
// the odd-indexed elements in (v_g0, v_g1), preserving relative order.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    // One pass rotates every element's 4-bit position index right by one
    // bit; three passes move bit 0 (the channel selector) to the top.
    for (int pass = 0; pass < 3; ++pass)
    {
        const __m128 first_lo  = _mm_unpacklo_ps(v_r0, v_g0);
        const __m128 first_hi  = _mm_unpackhi_ps(v_r0, v_g0);
        const __m128 second_lo = _mm_unpacklo_ps(v_r1, v_g1);
        const __m128 second_hi = _mm_unpackhi_ps(v_r1, v_g1);

        v_r0 = first_lo;
        v_r1 = first_hi;
        v_g0 = second_lo;
        v_g1 = second_hi;
    }
}
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
@ -492,6 +565,6 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}
#endif
#endif // CV_SSE2
#endif //__OPENCV_CORE_SSE_UTILS_HPP__

@ -13,6 +13,7 @@
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -62,8 +63,11 @@ template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
@ -72,9 +76,11 @@ template<typename T> struct VSplit4;
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2){ \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
@ -84,9 +90,11 @@ template<typename T> struct VSplit4;
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3){ \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
@ -96,28 +104,174 @@ template<typename T> struct VSplit4;
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
// Generic two-channel splitter used for element types that have no
// specialization: it reports support == false and its call operator
// leaves all buffers untouched.
template <typename T>
struct VSplit2
{
    bool support;

    VSplit2()
    {
        support = false;
    }

    // Intentionally empty: there is no vectorized path for this type.
    void operator()(const T *, T *, T *) const
    {
    }
};
// Generic three-channel splitter used for element types that have no
// specialization: it reports support == false and its call operator
// leaves all buffers untouched.
template <typename T>
struct VSplit3
{
    bool support;

    VSplit3()
    {
        support = false;
    }

    // Intentionally empty: there is no vectorized path for this type.
    void operator()(const T *, T *, T *, T *) const
    {
    }
};
// Generic four-channel splitter used for element types that have no
// specialization: it reports support == false and its call operator
// leaves all buffers untouched.
template <typename T>
struct VSplit4
{
    bool support;

    VSplit4()
    {
        support = false;
    }

    // Intentionally empty: there is no vectorized path for this type.
    void operator()(const T *, T *, T *, T *, T *) const
    {
    }
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = true; \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = true; \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = true; \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
}
// SSE2 specializations of VSplit2/3/4 for uchar, ushort and int. Any other
// element type falls back to the primary templates, whose support flag is
// false.
// NOTE(review): the int variants go through the float (ps) load, unpack and
// store intrinsics; presumably these only move bits and never interpret the
// values as floating point — confirm.
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
template<typename T> static void
@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn )
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn )
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn )
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
@ -265,27 +459,18 @@ template<typename T> struct VMerge4;
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#endif

@ -11,6 +11,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -404,13 +405,20 @@ static const uchar * initPopcountTable()
{
// we compute inverse popcount table,
// since we pass (img[x] == 0) mask as index in the table.
for( int j = 0; j < 256; j++ )
unsigned int j = 0u;
#if CV_POPCNT
if (checkHardwareSupport(CV_CPU_POPCNT))
for( ; j < 256u; j++ )
tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
#else
for( ; j < 256u; j++ )
{
int val = 0;
for( int mask = 1; mask < 256; mask += mask )
val += (j & mask) == 0;
tab[j] = (uchar)val;
}
#endif
initialized = true;
}

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -10,8 +10,7 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -11,6 +11,7 @@
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -11,6 +11,7 @@
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

@ -2998,12 +2998,12 @@ void printVersionInfo(bool useStdOut)
std::string cpu_features;
#if CV_MMX
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
#endif
#if CV_POPCNT
if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
#endif
#if CV_MMX
if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
#endif
#if CV_SSE
if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
#endif

Loading…
Cancel
Save