From 11a09ef5ccda32b166b7384949f485ec053e3ba5 Mon Sep 17 00:00:00 2001 From: Richard Yoo Date: Fri, 6 Jun 2014 13:37:13 -0700 Subject: [PATCH] Changes to support Intel AVX/AVX2 in cvResize(). --- CMakeLists.txt | 1 + cmake/OpenCVCompilerOptions.cmake | 14 +- modules/core/include/opencv2/core/core_c.h | 1 + .../core/include/opencv2/core/internal.hpp | 7 + modules/core/src/system.cpp | 35 + modules/imgproc/src/imgwarp.cpp | 1236 +++++++++++++---- modules/ts/src/ts_func.cpp | 3 + 7 files changed, 991 insertions(+), 306 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b610ecf971..50e6cd0230 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,7 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index d525609d18..f28aaeed50 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -143,8 +143,12 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_option(-mavx) endif() + if(ENABLE_AVX2) + add_extra_compiler_option(-mavx2) + endif() + # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed. - if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") + if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(avx|avx2)") if(ENABLE_SSE3) add_extra_compiler_option(-msse3) endif() @@ -165,7 +169,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(X86 OR X86_64) if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4) - if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)") + if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx|avx2)") add_extra_compiler_option(-mfpmath=sse)# !! 
important - be on the same wave with x64 compilers
       else()
         add_extra_compiler_option(-mfpmath=387)
@@ -220,6 +224,10 @@ if(MSVC)
       set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
     endif()
 
+    if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
+      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
+    endif()
+
     if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
       set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE4.1")
     endif()
@@ -238,7 +246,7 @@ if(MSVC)
     endif()
   endif()
 
-  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX)
+  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
   endif()
 
diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index 38abfc409b..b108e3498e 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1706,6 +1706,7 @@ CVAPI(double) cvGetTickFrequency( void );
 #define CV_CPU_SSE4_2 7
 #define CV_CPU_POPCNT 8
 #define CV_CPU_AVX 10
+#define CV_CPU_AVX2 11
 #define CV_HARDWARE_MAX_FEATURE 255
 
 CVAPI(int) cvCheckHardwareSupport(int feature);
diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index 6c9d3d2f13..9959c169ac 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -141,6 +141,10 @@ CV_INLINE IppiSize ippiSize(const cv::Size & _size)
 #    define __xgetbv() 0
 #  endif
 #  endif
+#  if defined __AVX2__
+#    include <immintrin.h>
+#    define CV_AVX2 1
+#  endif
 #endif
 
 
@@ -176,6 +180,9 @@ CV_INLINE IppiSize ippiSize(const cv::Size & _size)
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
 #ifndef CV_NEON
 #  define CV_NEON 0
 #endif
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 68aff531f1..40d64ffe1b 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -253,6 +253,41 @@ struct HWFeatures
         f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
     }
 
+#if CV_AVX2
+    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+        __cpuidex(cpuid_data, 7, 0);
+    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+        #ifdef __x86_64__
+        asm __volatile__
+        (
+         "movl $7, %%eax\n\t"
+         "movl $0, %%ecx\n\t"
+         "cpuid\n\t"
+         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+        #else
+        asm volatile
+        (
+         "pushl %%ebx\n\t"
+         "movl $7,%%eax\n\t"
+         "movl $0,%%ecx\n\t"
+         "cpuid\n\t"
+         "popl %%ebx\n\t"
+         : "=a"(cpuid_data[0]), "=b"(cpuid_data[1]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
+         :
+         : "cc"
+        );
+        #endif
+    #endif
+
+    if( f.x86_family >= 6 )
+    {
+        f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
+    }
+#endif
+
     return f;
 }
 
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index dcd718fb68..88b278710d 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -54,6 +54,10 @@
 static IppStatus sts = ippInit();
 #endif
 
+#ifdef _MSC_VER
+# pragma warning(disable:4752) // Disable warning for mixing SSE and AVX
+#endif
+
 namespace cv
 {
 
@@ -451,350 +455,741 @@ struct HResizeNoVec
 
 #if CV_SSE2
 
-struct VResizeLinearVec_32s8u
+static int VResizeLinearVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width)
 {
-    int operator()(const uchar** _src, uchar* 
dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1]; + int x = 0; + __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); + __m128i delta = _mm_set1_epi16(2); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1]; - int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), 
_mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } - for( ; x < width - 4; x += 4 ) + for( ; x < width - 4; x += 4 ) + { + __m128i x0, y0; + x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); + y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); + x0 = _mm_packs_epi32(x0, x0); + y0 = _mm_packs_epi32(y0, y0); + x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x0 = _mm_packus_epi16(x0, x0); + *(int*)(dst + x) = _mm_cvtsi128_si32(x0); + } + + return x; +} + +#if CV_AVX2 +int VResizeLinearVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) +{ + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1]; + int x = 0; + __m256i b0 = _mm256_set1_epi16(beta[0]), b1 = _mm256_set1_epi16(beta[1]); + __m256i delta = _mm256_set1_epi16(2); + const int index[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + __m256i shuffle = _mm256_load_si256((const __m256i*)index); + + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 32; x += 32 ) { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); + __m256i x0, x1, x2, y0, y1, y2; + x0 = _mm256_load_si256((const __m256i*)(S0 + x)); + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S1 + x)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); + x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); + y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); + + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 16)); + 
x2 = _mm256_load_si256((const __m256i*)(S0 + x + 24)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 16)); + y2 = _mm256_load_si256((const __m256i*)(S1 + x + 24)); + x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); + y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); + + x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); + + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); + x0 = _mm256_packus_epi16(x0, x1); + x0 = _mm256_permutevar8x32_epi32(x0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), x0); + } + else + for( ; x <= width - 32; x += 32 ) + { + __m256i x0, x1, x2, y0, y1, y2; + x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); + x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4)); + y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4)); + + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 16)); + x2 = _mm256_loadu_si256((const __m256i*)(S0 + x + 24)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 16)); + y2 = _mm256_loadu_si256((const __m256i*)(S1 + x + 24)); + x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4)); + y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4)); + + x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1)); + + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2); + x0 = _mm256_packus_epi16(x0, x1); + x0 = _mm256_permutevar8x32_epi32(x0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), x0); } - return x; + for( ; x < width - 8; x += 8 ) + { + __m256i x0, y0; + x0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S0 + x)), 4); + y0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S1 + x)), 4); + x0 = _mm256_packs_epi32(x0, x0); + y0 = _mm256_packs_epi32(y0, y0); + x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1)); + x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2); + x0 = _mm256_packus_epi16(x0, x0); + *(int*)(dst + x) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 0)); + *(int*)(dst + x + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 1)); } -}; + return x; +} +#endif -template struct VResizeLinearVec_32f16 +struct VResizeLinearVec_32s8u { - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; +#if CV_AVX2 + if( checkHardwareSupport(CV_CPU_AVX2) ) + return VResizeLinearVec_32s8u_avx2(_src, dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE2) ) + return VResizeLinearVec_32s8u_sse2(_src, dst, _beta, width); - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; + return 0; + } +}; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = 
_mm_set1_epi16((short)shiftval); +template +int VResizeLinearVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_load_ps(S0 + x + 8); + x1 = _mm_load_ps(S0 + x + 12); + y0 = _mm_load_ps(S1 + x + 8); + y1 = _mm_load_ps(S1 + x + 12); - for( ; x < width - 4; x += 4 ) + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + 
_mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); + } + else + for( ; x <= width - 16; x += 16 ) { - __m128 x0, y0; - __m128i t0; + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_loadu_ps(S0 + x + 8); + x1 = _mm_loadu_ps(S0 + x + 12); + y0 = _mm_loadu_ps(S1 + x + 8); + y1 = _mm_loadu_ps(S1 + x + 12); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + _mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); } - return x; + for( ; x < width - 4; x += 4 ) + { + __m128 x0, y0; + __m128i t0; + x0 = _mm_loadu_ps(S0 + x); + y0 = _mm_loadu_ps(S1 + x); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); + _mm_storel_epi64( (__m128i*)(dst + x), t0); + } + + return x; +} + +#if CV_AVX2 +template +int VResizeLinearVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); + __m256i preshift = _mm256_set1_epi32(shiftval); + __m256i postshift = _mm256_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 32; x += 32 ) + { + __m256 x0, x1, y0, y1; + __m256i t0, t1, t2; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); + + x0 = _mm256_load_ps(S0 + x + 16); + x1 = _mm256_load_ps(S0 + x + 24); + y0 = _mm256_load_ps(S1 + x + 16); + y1 = _mm256_load_ps(S1 + x + 24); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); + + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); + } + else + for( ; x <= width - 32; x += 32 ) + { + __m256 x0, x1, y0, y1; + __m256i t0, t1, t2; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); 
+ x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift); + + x0 = _mm256_loadu_ps(S0 + x + 16); + x1 = _mm256_loadu_ps(S0 + x + 24); + y0 = _mm256_loadu_ps(S1 + x + 16); + y1 = _mm256_loadu_ps(S1 + x + 24); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift); + t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift); + + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1); + } + + for( ; x < width - 8; x += 8 ) + { + __m256 x0, y0; + __m256i t0; + x0 = _mm256_loadu_ps(S0 + x); + y0 = _mm256_loadu_ps(S1 + x); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift); + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t0), postshift); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(t0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 4), _mm256_extracti128_si256(t0, 1)); + } + + return x; +} +#endif + +template struct VResizeLinearVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { +#if CV_AVX2 + if( checkHardwareSupport(CV_CPU_AVX2) ) + return VResizeLinearVec_32f16_avx2(_src, _dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE2) ) + return VResizeLinearVec_32f16_sse2(_src, _dst, _beta, width); + + return 0; } }; typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; -struct VResizeLinearVec_32f +static int VResizeLinearVec_32f_sse(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) { - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + x0 = 
_mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + + return x; +} - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); +#if CV_AVX +int VResizeLinearVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]); - return x; + if( (((size_t)S0|(size_t)S1)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + + _mm256_storeu_ps( dst + x, x0); + _mm256_storeu_ps( dst + x + 8, x1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1)); + x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1)); + + _mm256_storeu_ps( dst + x, x0); + _mm256_storeu_ps( dst + x + 8, x1); + } + + return x; +} +#endif + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { +#if CV_AVX + if( checkHardwareSupport(CV_CPU_AVX) ) + return VResizeLinearVec_32f_avx(_src, _dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE) ) + return VResizeLinearVec_32f_sse(_src, _dst, _beta, width); + + return 0; } }; +static int VResizeCubicVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) +{ + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), + b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_load_si128((const __m128i*)(S2 + x)); + x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S3 + x)); + y1 = _mm_load_si128((const __m128i*)(S3 + 
x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + + return x; +} + +#if CV_AVX2 +int VResizeCubicVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width ) +{ + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + __m256 b0 = _mm256_set1_ps(beta[0]*scale), b1 = _mm256_set1_ps(beta[1]*scale), + b2 = _mm256_set1_ps(beta[2]*scale), b3 = _mm256_set1_ps(beta[3]*scale); + const int shuffle = 0xd8; // 11 | 01 | 10 | 00 + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256i x0, x1, y0, y1; + __m256 s0, s1, f0, f1; + x0 = _mm256_load_si256((const __m256i*)(S0 + x)); + x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S1 + x)); + y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8)); + + s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); + s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_load_si256((const __m256i*)(S2 + x)); + x1 = _mm256_load_si256((const __m256i*)(S2 + x + 8)); + y0 = _mm256_load_si256((const __m256i*)(S3 + x)); + y1 = _mm256_load_si256((const __m256i*)(S3 + x + 8)); + + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_cvtps_epi32(s0); + x1 = _mm256_cvtps_epi32(s1); + + x0 = _mm256_packs_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, shuffle); + x0 = 
_mm256_packus_epi16(x0, x0); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1)); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256i x0, x1, y0, y1; + __m256 s0, s1, f0, f1; + x0 = _mm256_loadu_si256((const __m256i*)(S0 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S1 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8)); + + s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0); + s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_loadu_si256((const __m256i*)(S2 + x)); + x1 = _mm256_loadu_si256((const __m256i*)(S2 + x + 8)); + y0 = _mm256_loadu_si256((const __m256i*)(S3 + x)); + y1 = _mm256_loadu_si256((const __m256i*)(S3 + x + 8)); + + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3); + f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3); + s0 = _mm256_add_ps(s0, f0); + s1 = _mm256_add_ps(s1, f1); + + x0 = _mm256_cvtps_epi32(s0); + x1 = _mm256_cvtps_epi32(s1); + + x0 = _mm256_packs_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, shuffle); + x0 = _mm256_packus_epi16(x0, x0); + _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0)); + _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1)); + } + + return x; +} +#endif + struct VResizeCubicVec_32s8u { int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - 
__m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } +#if CV_AVX2 + if( checkHardwareSupport(CV_CPU_AVX2) ) + return VResizeCubicVec_32s8u_avx2(_src, dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE2) ) + return VResizeCubicVec_32s8u_sse2(_src, dst, _beta, width); - return x; + return 0; } }; -template struct VResizeCubicVec_32f16 +template +int VResizeCubicVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) { - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + __m128i t0, t1; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + x0 = _mm_load_ps(S2 + x); + x1 = _mm_load_ps(S2 + x + 4); + y0 = _mm_load_ps(S3 + x); + y1 = _mm_load_ps(S3 + x + 4); + + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); + t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); + + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); + _mm_storeu_si128( (__m128i*)(dst + x), t0); + } + else for( ; x <= width - 8; x += 8 ) { 
__m128 x0, x1, y0, y1, s0, s1; @@ -832,28 +1227,167 @@ template struct VResizeCubicVec_32f16 _mm_storeu_si128( (__m128i*)(dst + x), t0); } - return x; + return x; +} + +#if CV_AVX2 +template +int VResizeCubicVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), + b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); + __m256i preshift = _mm256_set1_epi32(shiftval); + __m256i postshift = _mm256_set1_epi16((short)shiftval); + const int shuffle = 0xd8; // 11 | 01 | 10 | 00 + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + __m256i t0, t1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_load_ps(S2 + x); + x1 = _mm256_load_ps(S2 + x + 8); + y0 = _mm256_load_ps(S3 + x); + y1 = _mm256_load_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); + + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); + t0 = _mm256_permute4x64_epi64(t0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + __m256i t0, t1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_loadu_ps(S2 + x); + x1 = _mm256_loadu_ps(S2 + x + 8); + y0 = _mm256_loadu_ps(S3 + x); + y1 = _mm256_loadu_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift); + t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift); + + t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift); + t0 = _mm256_permute4x64_epi64(t0, shuffle); + _mm256_storeu_si256( (__m256i*)(dst + x), t0); + } + + return x; +} +#endif + +template struct VResizeCubicVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { +#if CV_AVX2 + if( checkHardwareSupport(CV_CPU_AVX2) ) + return VResizeCubicVec_32f16_avx2(_src, _dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE2) ) + return VResizeCubicVec_32f16_sse2(_src, _dst, _beta, width); + + return 0; } }; typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; -struct VResizeCubicVec_32f +static int 
VResizeCubicVec_32f_sse(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) { - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); + + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + x0 = _mm_load_ps(S2 + x); + x1 = _mm_load_ps(S2 + x + 4); + y0 = _mm_load_ps(S3 + x); + y1 = _mm_load_ps(S3 + x + 4); + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + _mm_storeu_ps( dst + x, s0); + _mm_storeu_ps( dst + x + 4, s1); + } + else for( ; x <= width - 8; x += 8 ) { __m128 x0, x1, y0, y1, s0, s1; @@ -887,7 +1421,103 @@ struct VResizeCubicVec_32f _mm_storeu_ps( dst + x + 4, s1); } - return x; + return x; +} + +#if CV_AVX +int VResizeCubicVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]), + b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + x0 = _mm256_load_ps(S0 + x); + x1 = _mm256_load_ps(S0 + x + 8); + y0 = _mm256_load_ps(S1 + x); + y1 = _mm256_load_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = _mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_load_ps(S2 + x); + x1 = _mm256_load_ps(S2 + x + 8); + y0 = _mm256_load_ps(S3 + x); + y1 = _mm256_load_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + _mm256_storeu_ps( dst + x, s0); + _mm256_storeu_ps( dst + x + 8, s1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m256 x0, x1, y0, y1, s0, s1; + x0 = _mm256_loadu_ps(S0 + x); + x1 = _mm256_loadu_ps(S0 + x + 8); + y0 = _mm256_loadu_ps(S1 + x); + y1 = _mm256_loadu_ps(S1 + x + 8); + + s0 = _mm256_mul_ps(x0, b0); + s1 = _mm256_mul_ps(x1, b0); + y0 = _mm256_mul_ps(y0, b1); + y1 = 
_mm256_mul_ps(y1, b1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + x0 = _mm256_loadu_ps(S2 + x); + x1 = _mm256_loadu_ps(S2 + x + 8); + y0 = _mm256_loadu_ps(S3 + x); + y1 = _mm256_loadu_ps(S3 + x + 8); + + x0 = _mm256_mul_ps(x0, b2); + x1 = _mm256_mul_ps(x1, b2); + y0 = _mm256_mul_ps(y0, b3); + y1 = _mm256_mul_ps(y1, b3); + s0 = _mm256_add_ps(s0, x0); + s1 = _mm256_add_ps(s1, x1); + s0 = _mm256_add_ps(s0, y0); + s1 = _mm256_add_ps(s1, y1); + + _mm256_storeu_ps( dst + x, s0); + _mm256_storeu_ps( dst + x + 8, s1); + } + + return x; +} +#endif + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { +#if CV_AVX + if( checkHardwareSupport(CV_CPU_AVX) ) + return VResizeCubicVec_32f_avx(_src, _dst, _beta, width); +#endif + if( checkHardwareSupport(CV_CPU_SSE) ) + return VResizeCubicVec_32f_sse(_src, _dst, _beta, width); + + return 0; } }; diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 44f3e483fd..39907edac4 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -3005,6 +3005,9 @@ void printVersionInfo(bool useStdOut) #if CV_AVX if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx"; #endif +#if CV_AVX2 + if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2"; +#endif #if CV_NEON cpu_features += " neon"; // NEON is currently not checked at runtime #endif
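
A note for anyone verifying the change locally: below is a minimal, illustrative sketch, not part of the patch. It assumes an OpenCV 2.4 tree configured with ENABLE_AVX2=ON; the test program itself is hypothetical, while cv::checkHardwareSupport(), cv::resize() and the CV_CPU_AVX2 constant come from the library and from the changes above.

#include <cstdio>
#include <opencv2/opencv.hpp>

int main()
{
    // Reports 1 only when the library was built with AVX2 enabled and the
    // running CPU advertises AVX2 (CPUID leaf 7, EBX bit 5, as detected in system.cpp).
    std::printf("AVX2 supported: %d\n", (int)cv::checkHardwareSupport(CV_CPU_AVX2));

    // 8-bit bilinear upscale; this should exercise the fixed-point
    // VResizeLinearVec_32s8u path touched by this patch, which now
    // dispatches to the AVX2 kernel at runtime when available.
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(128)), dst;
    cv::resize(src, dst, cv::Size(), 1.5, 1.5, cv::INTER_LINEAR);
    return 0;
}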