From 8d48632ebef60e7c4d92b5c9d6549f8e1623010a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:28 +0300 Subject: [PATCH] avx2 --- CMakeLists.txt | 1 + cmake/OpenCVCompilerOptions.cmake | 3 + modules/core/include/opencv2/core/cvdef.h | 11 +++- modules/core/src/convert.cpp | 72 +++++++++++------------ modules/core/src/precomp.hpp | 1 + modules/core/src/system.cpp | 53 ++++++++++++++++- modules/ts/src/ts_func.cpp | 3 + 7 files changed, 105 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 75fcf9659b..7b5648efd4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 2f9068c60d..831026fb50 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_AVX) add_extra_compiler_option(-mavx) endif() + if(ENABLE_AVX2) + add_extra_compiler_option(-mavx2) + endif() # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed. if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 06894d7a5d..c52cb021cb 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -114,7 +114,8 @@ #define CV_CPU_SSE4_2 7 #define CV_CPU_POPCNT 8 #define CV_CPU_AVX 10 -#define CV_CPU_NEON 11 +#define CV_CPU_AVX2 11 +#define CV_CPU_NEON 12 // when adding to this list remember to update the enum in core/utility.cpp #define CV_HARDWARE_MAX_FEATURE 255 @@ -141,7 +142,7 @@ # include # define CV_SSE4_2 1 # endif -# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include @@ -150,6 +151,9 @@ # define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) # else # define __xgetbv() 0 +# ifdef __AVX2__ +# define CV_AVX2 1 +# endif # endif # endif #endif @@ -187,6 +191,9 @@ #ifndef CV_AVX # define CV_AVX 0 #endif +#ifndef CV_AVX2 +# define CV_AVX2 0 +#endif #ifndef CV_NEON # define CV_NEON 0 #endif diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 829b984c9f..55f08f1bde 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2294,26 +2294,44 @@ cvtScale_( const short* src, size_t sstep, { int x = 0; - #if CV_SSE2 - if(USE_SSE2)//~5X + #if CV_AVX2 + if (USE_AVX2) + { + __m256 scale256 = _mm256_set1_ps (scale); + __m256 shift256 = _mm256_set1_ps (shift); + __m256i zero = _mm256_setzero_si256(); + for ( ; x <= size.width - 16; x += 16) { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); - - _mm_storeu_si128((__m128i*)(dst + x), r0); - _mm_storeu_si128((__m128i*)(dst + x + 4), r1); - } + __m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x)); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src, zero); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src, zero); + __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256); + __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256); + _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); + _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); + } + } + #endif + #if CV_SSE2 + if (USE_SSE2)//~5X + { + __m128 scale128 = _mm_set1_ps (scale); + __m128 shift128 = _mm_set1_ps (shift); + for(; x <= size.width - 8; x += 8 ) + { + __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); + __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); + __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); + __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); + rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); + rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); + r0 = _mm_cvtps_epi32(rf0); + r1 = _mm_cvtps_epi32(rf1); + + _mm_storeu_si128((__m128i*)(dst + x), r0); + _mm_storeu_si128((__m128i*)(dst + x + 4), r1); } + } #elif CV_NEON float32x4_t v_shift = vdupq_n_f32(shift); for(; x <= size.width - 8; x += 8 ) @@ -2330,24 +2348,6 @@ cvtScale_( const short* src, size_t sstep, } #endif - //We will wait Haswell - /* - #if CV_AVX - if(USE_AVX)//2X - bad variant - { - ////TODO:AVX implementation (optimization?) required - __m256 scale256 = _mm256_set1_ps (scale); - __m256 shift256 = _mm256_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x))); - __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256); - __m256i res = _mm256_cvtps_epi32(r0); - _mm256_storeu_si256 ((__m256i*)(dst+x), res); - } - } - #endif*/ - for(; x < size.width; x++ ) dst[x] = saturate_cast(src[x]*scale + shift); } diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index ef154400e2..0f85cc5568 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -192,6 +192,7 @@ struct NoVec extern volatile bool USE_SSE2; extern volatile bool USE_SSE4_2; extern volatile bool USE_AVX; +extern volatile bool USE_AVX2; enum { BLOCK_SIZE = 1024 }; diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index d9a20873f6..11bbab3a25 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -82,6 +82,22 @@ pop ebx } } + static void __cpuidex(int* cpuid_data, int, int) + { + __asm + { + push edi + mov edi, cpuid_data + mov eax, 7 + mov ecx, 0 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + } + } #endif #endif @@ -203,7 +219,7 @@ struct HWFeatures enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; HWFeatures(void) - { + { memset( have, 0, sizeof(have) ); x86_family = 0; } @@ -251,6 +267,40 @@ struct HWFeatures f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + + // make the second call to the cpuid command in order to get + // information about extended features like AVX2 + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuidex(cpuid_data, 7, 0); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $7, %%eax\n\t" + "movl $0, %%ecx\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%eax\n\t" + "pushl %%edx\n\t" + "movl $7,%%eax\n\t" + "movl $0,%%ecx\n\t" + "cpuid\n\t" + "popl %%edx\n\t" + "popl %%eax\n\t" + : "=b"(cpuid_data[1]), "=c"(cpuid_data[2]) + : + : "cc" + ); + #endif + #endif + f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + } return f; @@ -290,6 +340,7 @@ IPPInitializer ippInitializer; volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; +volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2]; void setUseOptimized( bool flag ) { diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 7745c86c5c..53b62e74d7 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut) #if CV_AVX if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx"; #endif +#if CV_AVX2 + if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2"; +#endif #if CV_NEON cpu_features += " neon"; // NEON is currently not checked at runtime #endif