From 8d48632ebef60e7c4d92b5c9d6549f8e1623010a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Mon, 12 Jan 2015 10:59:28 +0300
Subject: [PATCH] core: add AVX2 build option, runtime detection and an AVX2 path for cvtScale_<short, int, float>

---
 CMakeLists.txt                            |  1 +
 cmake/OpenCVCompilerOptions.cmake         |  3 +
 modules/core/include/opencv2/core/cvdef.h | 11 +++-
 modules/core/src/convert.cpp              | 72 +++++++++++------------
 modules/core/src/precomp.hpp              |  1 +
 modules/core/src/system.cpp               | 53 ++++++++++++++++-
 modules/ts/src/ts_func.cpp                |  3 +
 7 files changed, 105 insertions(+), 39 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75fcf9659b..7b5648efd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,6 +221,7 @@ OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_AVX2                "Enable AVX2 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 2f9068c60d..831026fb50 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -140,6 +140,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(ENABLE_AVX)
       add_extra_compiler_option(-mavx)
     endif()
+    if(ENABLE_AVX2)
+      add_extra_compiler_option(-mavx2)
+    endif()
 
     # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
     if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 06894d7a5d..c52cb021cb 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -114,7 +114,8 @@
 #define CV_CPU_SSE4_2  7
 #define CV_CPU_POPCNT  8
 #define CV_CPU_AVX    10
-#define CV_CPU_NEON   11
+#define CV_CPU_AVX2   11
+#define CV_CPU_NEON   12
 // when adding to this list remember to update the enum in core/utility.cpp
 #define CV_HARDWARE_MAX_FEATURE 255
 
@@ -141,7 +142,7 @@
 #    include <nmmintrin.h>
 #    define CV_SSE4_2 1
 #  endif
-#  if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+#  if defined __AVX__ || defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #    include <immintrin.h>
@@ -150,6 +151,9 @@
 #      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
 #    else
 #      define __xgetbv() 0
 #    endif
+#    ifdef __AVX2__ // must stay outside the __xgetbv() fallback branch, otherwise CV_AVX2 is lost whenever _XCR_XFEATURE_ENABLED_MASK is defined
+#      define CV_AVX2 1
+#    endif
 #  endif
 #endif
@@ -187,6 +191,9 @@
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
 #ifndef CV_NEON
 #  define CV_NEON 0
 #endif
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 829b984c9f..55f08f1bde 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
     {
         int x = 0;
 
-         #if CV_SSE2
-            if(USE_SSE2)//~5X
+        #if CV_AVX2
+        if (USE_AVX2) // AVX2 path: converts 16 shorts to 16 ints per iteration; the remaining elements are left to the loops below
+        {
+            __m256 scale256 = _mm256_set1_ps (scale);
+            __m256 shift256 = _mm256_set1_ps (shift);
+            const int shuffle = 0xD8; // 64-bit element order 0,2,1,3: compensates for unpacklo/unpackhi operating per 128-bit lane
+            for ( ; x <= size.width - 16; x += 16)
             {
-                __m128 scale128 = _mm_set1_ps (scale);
-                __m128 shift128 = _mm_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
-                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
-                    __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
-                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
-                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
-                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-                    r0 = _mm_cvtps_epi32(rf0);
-                    r1 = _mm_cvtps_epi32(rf1);
-
-                    _mm_storeu_si128((__m128i*)(dst + x), r0);
-                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-                }
+                __m256i v_src = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i const *)(src + x)), shuffle);
+                __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); // src[x..x+7], sign-extended to 32 bits (shorts are signed)
+                __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); // src[x+8..x+15], sign-extended to 32 bits
+                __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256);
+                __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256);
+                _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
+                _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
+            }
+        }
+        #endif
+        #if CV_SSE2
+        if (USE_SSE2)//~5X (original author's note, presumably the speed-up over the scalar loop); handles 8 shorts per iteration
+        {
+            __m128 scale128 = _mm_set1_ps (scale); // scale broadcast to all four float lanes
+            __m128 shift128 = _mm_set1_ps (shift); // shift broadcast to all four float lanes
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); // 64-bit load: src[x..x+3]
+                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); // 64-bit load: src[x+4..x+7]
+                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); // duplicate each short into a 32-bit slot, arithmetic shift by 16 sign-extends it, then convert to float
+                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); // src * scale + shift
+                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+                r0 = _mm_cvtps_epi32(rf0); // float -> int32 using the current MXCSR rounding mode
+                r1 = _mm_cvtps_epi32(rf1);
+
+                _mm_storeu_si128((__m128i*)(dst + x), r0); // unaligned store of dst[x..x+3]
+                _mm_storeu_si128((__m128i*)(dst + x + 4), r1); // unaligned store of dst[x+4..x+7]
             }
+        }
         #elif CV_NEON
         float32x4_t v_shift = vdupq_n_f32(shift);
         for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
         }
         #endif
 
-        //We will wait Haswell
-        /*
-        #if CV_AVX
-            if(USE_AVX)//2X - bad variant
-            {
-                ////TODO:AVX implementation (optimization?) required
-                __m256 scale256 = _mm256_set1_ps (scale);
-                __m256 shift256 = _mm256_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-                    __m256i res = _mm256_cvtps_epi32(r0);
-                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-                }
-            }
-        #endif*/
-
         for(; x < size.width; x++ )
             dst[x] = saturate_cast<int>(src[x]*scale + shift);
     }
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index ef154400e2..0f85cc5568 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -192,6 +192,7 @@ struct NoVec
 extern volatile bool USE_SSE2;
 extern volatile bool USE_SSE4_2;
 extern volatile bool USE_AVX;
+extern volatile bool USE_AVX2;
 
 enum { BLOCK_SIZE = 1024 };
 
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d9a20873f6..11bbab3a25 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -82,6 +82,22 @@
             pop ebx
         }
     }
+    static void __cpuidex(int* cpuid_data, int function_id, int subfunction_id) // runs CPUID(function_id, subfunction_id); cpuid_data[0..3] receive EAX, EBX, ECX, EDX
+    {
+        __asm
+        {
+            push edi
+            mov edi, cpuid_data
+            mov eax, function_id    // requested leaf (previously hard-coded to 7, ignoring the argument)
+            mov ecx, subfunction_id // requested sub-leaf (previously hard-coded to 0, ignoring the argument)
+            cpuid
+            mov [edi], eax
+            mov [edi + 4], ebx
+            mov [edi + 8], ecx
+            mov [edi + 12], edx
+            pop edi
+        }
+    }
   #endif
 #endif
 
@@ -203,7 +219,7 @@ struct HWFeatures
     enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
 
     HWFeatures(void)
-     {
+    {
         memset( have, 0, sizeof(have) );
         x86_family = 0;
     }
@@ -251,6 +267,40 @@ struct HWFeatures
             f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+            // second CPUID query: leaf 7, sub-leaf 0 reports the extended
+            // features; AVX2 is EBX bit 5, which ends up in cpuid_data[1]
+        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+            __cpuidex(cpuid_data, 7, 0);
+        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+            #ifdef __x86_64__
+            asm __volatile__
+            (
+             "movl $7, %%eax\n\t"
+             "movl $0, %%ecx\n\t"
+             "cpuid\n\t"
+             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :
+             : "cc"
+            );
+            #else
+            asm volatile
+            (
+             "pushl %%eax\n\t"
+             "pushl %%edx\n\t"
+             "movl $7,%%eax\n\t"
+             "movl $0,%%ecx\n\t"
+             "cpuid\n\t"
+             "popl %%edx\n\t"
+             "popl %%eax\n\t"
+             : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
+             :
+             : "cc"
+            );
+            #endif
+        #endif
+            f.have[CV_CPU_AVX2]   = f.have[CV_CPU_AVX] && (cpuid_data[1] & (1<<5)) != 0; // AVX2 needs the AVX + OSXSAVE condition checked above in addition to its own CPUID bit
+
         }
 
         return f;
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
 volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
 volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
+volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
 
 void setUseOptimized( bool flag )
 {
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 7745c86c5c..53b62e74d7 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -3019,6 +3019,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
+#endif
 #if CV_NEON
     cpu_features += " neon"; // NEON is currently not checked at runtime
 #endif