From f2cd65cf1e888ff7b48e67578fba59c2d533bc85 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 12 Jan 2015 10:59:30 +0300
Subject: [PATCH] fixes

Assorted SSE/AVX fixes:
- require MSVC 2013+ (MSVC_VERSION 1800) for /arch:AVX2 and CV_AVX2,
  and check plain _MSC_VER instead of _MSC_FULL_VER for CV_AVX
- include the MSVC popcount header (<nmmintrin.h>) when _MSC_VER is defined
- disable warning C4324 (structure padded due to __declspec(align()))
  for all MSVC versions, not only MSVC 2005
- correct the error message in sse_utils.hpp
- fix a duplicated v_r0 assignment in the four-channel _mm_interleave_epi16
- use _mm_interleave_* (not _mm_deinterleave_*) in the MERGE2 kernels and
  move the ushort MERGE2 kernel under CV_SSE4_1
- silence truncation warnings (_mm_set1_epi8(-1), explicit short casts
  in RGB2Gray)
- cloning tests: load the reference image once and assert that it was
  actually read
---
 cmake/OpenCVCompilerOptions.cmake           |  7 +-
 modules/core/include/opencv2/core/cvdef.h   |  8 +-
 .../core/include/opencv2/core/sse_utils.hpp | 82 +++++++++----------
 modules/core/src/arithm.cpp                 |  2 +-
 modules/core/src/convert.cpp                |  7 +-
 modules/imgproc/src/color.cpp               |  6 +-
 modules/photo/test/test_cloning.cpp         | 28 +++++--
 7 files changed, 78 insertions(+), 62 deletions(-)

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index bbe617dd69..66e16e7863 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -224,7 +224,7 @@ if(MSVC)
     set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
   endif()
 
-  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1600)
+  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
   endif()
   if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
@@ -309,7 +309,7 @@ if(MSVC)
   string(REPLACE "/W3" "/W4" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
 
   if(NOT ENABLE_NOISY_WARNINGS AND MSVC_VERSION EQUAL 1400)
-    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267 /wd4324)
+    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4510 /wd4610 /wd4312 /wd4201 /wd4244 /wd4328 /wd4267)
   endif()
 
   # allow extern "C" functions throw exceptions
@@ -321,6 +321,7 @@ if(MSVC)
   endforeach()
 
   if(NOT ENABLE_NOISY_WARNINGS)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY
+    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY
+    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align())
   endif()
 endif()
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 610c3fbad0..a9d59c7693 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -159,12 +159,14 @@
 #  define CV_SSE4_2 1
 # endif
 # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#  ifndef _MSC_VER
+#  ifdef _MSC_VER
+#    include <nmmintrin.h>
+#  else
 #    include <popcntintrin.h>
 #  endif
 #  define CV_POPCNT 1
 # endif
-# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #  include <immintrin.h>
@@ -175,7 +177,7 @@
 #  define __xgetbv() 0
 # endif
 # endif
-# if defined __AVX2__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800)
 #  include <immintrin.h>
 #  define CV_AVX2 1
 #  if defined __FMA__
diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp
index 7af6d84f2d..e0283eb3f3 100644
--- a/modules/core/include/opencv2/core/sse_utils.hpp
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@@ -43,7 +43,7 @@
 #define __OPENCV_CORE_SSE_UTILS_HPP__
 
 #ifndef __cplusplus
-#  error base.hpp header must be compiled as C++
+#  error sse_utils.hpp header must be compiled as C++
 #endif
 
 #if CV_SSE2
@@ -117,7 +117,7 @@ inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0
 
 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
-{ 
+{
     __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
     __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
     __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
@@ -165,9 +165,9 @@
 }
 
 inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
-{ 
+{
     __m128i v_mask = _mm_set1_epi16(0x00ff);
-    
+
     __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
     __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
     __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
@@ -177,28 +177,28 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
     __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
     __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
-    
+
     __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
     __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
     __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
     __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
-    
+
     __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
     __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
     __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
     __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
-    
+
     v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
     v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
     v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
     v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
 }
 
-inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, 
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
-{ 
+{
     __m128i v_mask = _mm_set1_epi16(0x00ff);
-    
+
     __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
     __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
     __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
@@ -237,7 +237,7 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
 
 inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
-{ 
+{
     __m128i v_mask = _mm_set1_epi16(0x00ff);
 
     __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
@@ -286,8 +286,8 @@ inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
 }
 
-inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) 
-{ 
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
     __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
     __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
     __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
@@ -310,8 +310,8 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
 }
 
 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
-                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) 
-{ 
+                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
     __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
     __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
     __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
@@ -342,7 +342,7 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
 }
 
 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
-                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) 
+                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
 {
     __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
     __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
@@ -352,7 +352,7 @@ inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g
     __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
     __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
     __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
-    
+
     __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
     __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
     __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
@@ -393,14 +393,14 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
 
     __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
-    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); 
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
     __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
-    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); 
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
 
     __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
-    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); 
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
     __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
-    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); 
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
 
     v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
     v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
@@ -421,18 +421,18 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
 
     __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
-    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); 
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
     __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
-    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); 
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
     __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
-    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); 
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
 
     __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
-    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); 
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
     __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
-    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); 
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
     __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
-    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); 
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
 
     v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
     v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
@@ -457,26 +457,26 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
     __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
 
     __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
-    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); 
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
     __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
-    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); 
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
     __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
-    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); 
+    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
     __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
-    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); 
+    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
 
     __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
-    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); 
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
     __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
-    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); 
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
     __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
-    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); 
+    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
     __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
-    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); 
+    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
 
     v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
     v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
-    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
     v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
     v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
     v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
@@ -487,12 +487,12 @@ inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
 #endif // CV_SSE4_1
 
 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
-{ 
+{
     __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
     __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
     __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
     __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
-    
+
     __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
     __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
     __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
@@ -506,14 +506,14 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m
 
 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
-{ 
+{
     __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
     __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
     __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
     __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
     __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
     __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
-    
+
     __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
     __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
     __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
@@ -531,7 +531,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
-{ 
+{
     __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
     __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
     __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 8f490a9c95..4c14732e79 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -3476,7 +3476,7 @@ struct Cmp_SIMD
 
         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
 
-        v_mask = _mm_set1_epi8(0xff);
+        v_mask = _mm_set1_epi8(-1);
     }
 
     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index a48e90e452..626a666a95 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -616,18 +616,17 @@ struct VMerge4
     bool support; \
 }
 
-MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
 MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
 MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128);
 
-MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
-
 #if CV_SSE4_1
+MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
 MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
 MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128);
 #endif
 
-MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
+MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
 MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
 MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps);
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 7197627b20..675d6b9089 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -1460,9 +1460,9 @@ struct RGB2Gray
         if( blueIdx == 0 )
             std::swap(coeffs[0], coeffs[2]);
 
-        v_cb = _mm_set1_epi16(coeffs[0]);
-        v_cg = _mm_set1_epi16(coeffs[1]);
-        v_cr = _mm_set1_epi16(coeffs[2]);
+        v_cb = _mm_set1_epi16((short)coeffs[0]);
+        v_cg = _mm_set1_epi16((short)coeffs[1]);
+        v_cr = _mm_set1_epi16((short)coeffs[2]);
 
         v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
     }
diff --git a/modules/photo/test/test_cloning.cpp b/modules/photo/test/test_cloning.cpp
index 56d166205c..1f86612a4a 100644
--- a/modules/photo/test/test_cloning.cpp
+++ b/modules/photo/test/test_cloning.cpp
@@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression)
     p.y = destination.size().height/2;
 
     seamlessClone(source, destination, mask, p, result, 1);
-
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
 
     SAVE(result);
 
@@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -111,7 +113,9 @@ TEST(Photo_SeamlessClone_mixed, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/";
     string original_path1 = folder + "source1.png";
    string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -186,7 +197,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -197,6 +208,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -209,7 +221,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);