From fadf25acd6e255ed03303d9354b0e0bd93325e3c Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Mon, 3 Jul 2017 18:21:22 +0300 Subject: [PATCH] SSE4_1 optimized implementation of resize and warp functions migrated to separate file --- modules/imgproc/src/imgwarp.avx2.cpp | 12 +- modules/imgproc/src/imgwarp.cpp | 435 ++-------------------- modules/imgproc/src/imgwarp.hpp | 18 +- modules/imgproc/src/imgwarp.sse4_1.cpp | 489 ++++++++++++++++++++++++- 4 files changed, 548 insertions(+), 406 deletions(-) diff --git a/modules/imgproc/src/imgwarp.avx2.cpp b/modules/imgproc/src/imgwarp.avx2.cpp index 321957f39e..d434807706 100644 --- a/modules/imgproc/src/imgwarp.avx2.cpp +++ b/modules/imgproc/src/imgwarp.avx2.cpp @@ -83,7 +83,9 @@ public: uchar* Dstart = D; int sy = std::min(cvFloor(y*ify), ssize.height-1); const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC #pragma unroll(4) +#endif for(x = 0; x < avxWidth; x += 8) { const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); @@ -106,7 +108,9 @@ public: uchar* Dstart = D; int sy = std::min(cvFloor(y*ify), ssize.height-1); const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC #pragma unroll(4) +#endif for(x = 0; x < avxWidth; x += 8) { const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); @@ -157,8 +161,8 @@ public: const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); - const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, - 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, + // 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); if(((int64)(dst.data + dst.step) & 0x1f) == 0) { for(y = range.start; y < range.end; y++) @@ -168,7 +172,9 @@ public: int sy = std::min(cvFloor(y*ify), ssize.height-1); const uchar* S = src.data + sy*src.step; const uchar* S2 = S - 2; +#ifdef CV_ICC #pragma unroll(4) +#endif for(x = 0; x < avxWidth; x += 16) { const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); @@ -200,7 +206,9 @@ public: int sy = std::min(cvFloor(y*ify), ssize.height-1); const uchar* S = src.data + sy*src.step; const uchar* S2 = S - 2; +#ifdef CV_ICC #pragma unroll(4) +#endif for(x = 0; x < avxWidth; x += 16) { const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 4f040a91cc..a960c01ffb 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -450,9 +450,9 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) { if(pix_size == 2) - opt_SSE41::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); else - opt_SSE41::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); } else #endif @@ -916,50 +916,14 @@ struct VResizeCubicVec_32f } }; -#if CV_SSE4_1 +#if CV_TRY_SSE4_1 struct VResizeLanczos4Vec_32f16u { int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = 
src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); - } - - return x; + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; } }; @@ -5149,8 +5113,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, #if CV_SSE2 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); #endif -#if CV_SSE4_1 - bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); +#if CV_TRY_SSE4_1 + bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif const float scale = 1.f/INTER_TAB_SIZE; @@ -5183,29 +5147,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_s16(dst1 + (x << 1), v_dst); } - #elif CV_SSE4_1 + #elif CV_TRY_SSE4_1 if (useSSE4_1) - { - for( ; x <= size.width - 16; x += 16 ) - { - __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); - __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); - - __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); - __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), - _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); - - _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); - - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); - } - } + opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width); + else #endif for( ; x < size.width; x++ ) { @@ -5240,52 +5185,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } - #elif CV_SSE4_1 + #elif CV_TRY_SSE4_1 
if (useSSE4_1) - { - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - - for( ; x <= size.width - 16; x += 16 ) - { - __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); - __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); - __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); - __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); - - __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); - - v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); - v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); - v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); - v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); - - __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), - _mm_srai_epi32(v_ix1, INTER_BITS)); - __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), - _mm_srai_epi32(v_iy1, INTER_BITS)); - v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), - _mm_and_si128(v_ix0, v_its1)); - v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), - _mm_and_si128(v_ix1, v_its1)); - _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); - - _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); - - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); - _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); - } - } + opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width); + else #endif for( ; x < size.width; x++ ) { @@ -5346,30 +5249,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } - #elif CV_SSE4_1 + #elif CV_TRY_SSE4_1 if (useSSE4_1) - { - __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); - __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); - - for( ; x <= size.width - 4; x += 4 ) - { - __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); - __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); - - __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), - _mm_srai_epi32(v_src1, INTER_BITS)); - _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); - - // x0 y0 x1 y1 . . . - v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), - _mm_and_si128(v_src1, v_its1)); - __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . - _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . 
- _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); - } - } + opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width); + else #endif for( ; x < size.width; x++ ) { @@ -5557,8 +5440,8 @@ public: #if CV_SSE2 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); #endif - #if CV_SSE4_1 - bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); + #if CV_TRY_SSE4_1 + bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); @@ -5596,31 +5479,10 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } - #elif CV_SSE4_1 + #elif CV_TRY_SSE4_1 if (useSSE4_1) - { - __m128i v_X0 = _mm_set1_epi32(X0); - __m128i v_Y0 = _mm_set1_epi32(Y0); - for ( ; x1 <= bw - 16; x1 += 16) - { - __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); - __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); - - __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); - __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), - _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); - - _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); - } - } + opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw); + else #endif for( ; x1 < bw; x1++ ) { @@ -6132,18 +5994,10 @@ public: int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); - #if CV_SSE4_1 - bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - __m128d v_M0 = _mm_set1_pd(M[0]); - __m128d v_M3 = _mm_set1_pd(M[3]); - __m128d v_M6 = _mm_set1_pd(M[6]); - __m128d v_intmax = _mm_set1_pd((double)INT_MAX); - __m128d v_intmin = _mm_set1_pd((double)INT_MIN); - __m128d v_2 = _mm_set1_pd(2), - v_zero = _mm_setzero_pd(), - v_1 = _mm_set1_pd(1), - v_its = _mm_set1_pd(INTER_TAB_SIZE); - __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + #if CV_TRY_SSE4_1 + Ptr pwarp_impl_sse4; + if(CV_CPU_HAS_SUPPORT_SSE4_1) + pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M); #endif for( y = range.start; y < range.end; y += bh0 ) @@ -6167,116 +6021,11 @@ public: { x1 = 0; - #if CV_SSE4_1 - if (haveSSE4_1) - { - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) - { - // 0-3 - __m128i v_X0, v_Y0; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, 
v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 4-8 - __m128i v_X1, v_Y1; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // convert to 16s - v_X0 = _mm_packs_epi32(v_X0, v_X1); - v_X1 = _mm_packs_epi32(v_X2, v_X3); - v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); - v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); - - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); - } - } + #if CV_TRY_SSE4_1 + if (pwarp_impl_sse4) + pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw); + else #endif - for( ; x1 < bw; x1++ ) { double W = W0 + M[6]*x1; @@ -6295,129 +6044,11 @@ public: short* alpha = A + y1*bw; x1 = 0; - #if CV_SSE4_1 - if (haveSSE4_1) - { - __m128d v_X0d = _mm_set1_pd(X0); - __m128d v_Y0d = _mm_set1_pd(Y0); - __m128d v_W0 = _mm_set1_pd(W0); - __m128d v_x1 = _mm_set_pd(1, 0); - - for( ; x1 <= bw - 16; x1 += 16 ) - { - // 0-3 - __m128i v_X0, v_Y0; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 4-8 - __m128i v_X1, v_Y1; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - 
_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 8-11 - __m128i v_X2, v_Y2; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // 12-15 - __m128i v_X3, v_Y3; - { - __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); - v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); - __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); - __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); - v_x1 = _mm_add_pd(v_x1, v_2); - - v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); - v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); - } - - // store alpha - __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), - _mm_and_si128(v_X0, v_itsi1)); - __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), - _mm_and_si128(v_X1, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); - - v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), - _mm_and_si128(v_X2, v_itsi1)); - v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), - _mm_and_si128(v_X3, v_itsi1)); - _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); - - // convert to 16s - v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); - v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); - v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); - v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); - - _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); - - _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), 
v_Y0); - _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); - } - } + #if CV_TRY_SSE4_1 + if (pwarp_impl_sse4) + pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw); + else #endif - for( ; x1 < bw; x1++ ) { double W = W0 + M[6]*x1; diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp index 428bcc51e7..ed1146dcd3 100644 --- a/modules/imgproc/src/imgwarp.hpp +++ b/modules/imgproc/src/imgwarp.hpp @@ -61,11 +61,27 @@ void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); #endif } -namespace opt_SSE41 + +namespace opt_SSE4_1 { #if CV_TRY_SSE4_1 void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); + +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width); +void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width); +void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width); +void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width); +void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw); + +class WarpPerspectiveLine_SSE4 +{ +public: + static Ptr getImpl(const double *M); + virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) = 0; + virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) = 0; + virtual ~WarpPerspectiveLine_SSE4() {}; +}; #endif } } diff --git a/modules/imgproc/src/imgwarp.sse4_1.cpp b/modules/imgproc/src/imgwarp.sse4_1.cpp index b326ffcb1a..79137d1230 100644 --- a/modules/imgproc/src/imgwarp.sse4_1.cpp +++ b/modules/imgproc/src/imgwarp.sse4_1.cpp @@ -52,7 +52,7 @@ namespace cv { -namespace opt_SSE41 +namespace opt_SSE4_1 { class resizeNNInvokerSSE2 : @@ -186,7 +186,494 @@ void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); } +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for (; x <= width - 8; x += 8) + { + __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); + 
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); + + __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); + __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); + } + + return x; } + +void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width) +{ + int x = 0; + for (; x <= width - 16; x += 16) + { + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); + + __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); + __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); + + _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + } + + for (; x < width; x++) + { + dst1[x * 2] = saturate_cast(src1f[x]); + dst1[x * 2 + 1] = saturate_cast(src2f[x]); + } +} + +void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width) +{ + int x = 0; + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + + for (; x <= width - 16; x += 16) + { + __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); + __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); + __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); + __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); + + __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); + + v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); + v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); + v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); + v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); + + __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + v_dst21 = 
_mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); + + _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + } + for (; x < width; x++) + { + int ix = saturate_cast(src1f[x] * INTER_TAB_SIZE); + int iy = saturate_cast(src2f[x] * INTER_TAB_SIZE); + dst1[x * 2] = saturate_cast(ix >> INTER_BITS); + dst1[x * 2 + 1] = saturate_cast(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1))); + } } +void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width) +{ + int x = 0; + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16); + + for (; x <= width - 4; x += 4) + { + __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); + __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); + + __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), + _mm_srai_epi32(v_src1, INTER_BITS)); + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + + // x0 y0 x1 y1 . . . + v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), + _mm_and_si128(v_src1, v_its1)); + __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . + _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . + _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + } + for (; x < width; x++) + { + int ix = saturate_cast(src1f[x * 2] * INTER_TAB_SIZE); + int iy = saturate_cast(src1f[x * 2 + 1] * INTER_TAB_SIZE); + dst1[x * 2] = saturate_cast(ix >> INTER_BITS); + dst1[x * 2 + 1] = saturate_cast(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1))); + } +} + +void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) +{ + const int AB_BITS = MAX(10, (int)INTER_BITS); + int x1 = 0; + + __m128i v_X0 = _mm_set1_epi32(X0); + __m128i v_Y0 = _mm_set1_epi32(Y0); + for (; x1 <= bw - 16; x1 += 16) + { + __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS)); + __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS)); + + __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS)); + __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS)); + + _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); + _mm_storeu_si128((__m128i 
*)(xy + x1 * 2 + 8), v_x1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + } + for (; x1 < bw; x1++) + { + int X = (X0 + adelta[x1]) >> AB_BITS; + int Y = (Y0 + bdelta[x1]) >> AB_BITS; + xy[x1 * 2] = saturate_cast(X); + xy[x1 * 2 + 1] = saturate_cast(Y); + } +} + + +class WarpPerspectiveLine_SSE4_Impl: public WarpPerspectiveLine_SSE4 +{ +public: + WarpPerspectiveLine_SSE4_Impl(const double *M) + { + v_M0 = _mm_set1_pd(M[0]); + v_M3 = _mm_set1_pd(M[3]); + v_M6 = _mm_set1_pd(M[6]); + v_intmax = _mm_set1_pd((double)INT_MAX); + v_intmin = _mm_set1_pd((double)INT_MIN); + v_2 = _mm_set1_pd(2); + v_zero = _mm_setzero_pd(); + v_1 = _mm_set1_pd(1); + v_its = _mm_set1_pd(INTER_TAB_SIZE); + v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + } + virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) + { + int x1 = 0; + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for (; x1 <= bw - 16; x1 += 16) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // convert to 16s + v_X0 = _mm_packs_epi32(v_X0, v_X1); + v_X1 = _mm_packs_epi32(v_X2, v_X3); + v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); + v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + + for (; x1 < bw; x1++) + { + double W = W0 + M[6] * x1; + W = W ? 1. 
/ W : 0; + double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W)); + double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W)); + int X = saturate_cast(fX); + int Y = saturate_cast(fY); + + xy[x1 * 2] = saturate_cast(X); + xy[x1 * 2 + 1] = saturate_cast(Y); + } + } + virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) + { + int x1 = 0; + + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for (; x1 <= bw - 16; x1 += 16) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 
= _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // store alpha + __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), + _mm_and_si128(v_X0, v_itsi1)); + __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), + _mm_and_si128(v_X1, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); + + v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), + _mm_and_si128(v_X2, v_itsi1)); + v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), + _mm_and_si128(v_X3, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); + + // convert to 16s + v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); + v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); + v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); + v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + for (; x1 < bw; x1++) + { + double W = W0 + M[6] * x1; + W = W ? 
INTER_TAB_SIZE / W : 0; + double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W)); + double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W)); + int X = saturate_cast<int>(fX); + int Y = saturate_cast<int>(fY); + + xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS); + xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS); + alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + + (X & (INTER_TAB_SIZE - 1))); + } + } + virtual ~WarpPerspectiveLine_SSE4_Impl() {}; +private: + __m128d v_M0; + __m128d v_M3; + __m128d v_M6; + __m128d v_intmax; + __m128d v_intmin; + __m128d v_2, + v_zero, + v_1, + v_its; + __m128i v_itsi1; +}; + +Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M) +{ + return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M)); +} + +} +} /* End of file. */
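
Usage note: after this patch the per-line warp-perspective kernels are reached through the small WarpPerspectiveLine_SSE4 dispatch class declared in imgwarp.hpp instead of inline intrinsics in imgwarp.cpp. The sketch below mirrors the call sites added above in the warp-perspective invoker; the helper name warpPerspectiveBlockline_sketch is hypothetical, and the inputs (the 3x3 matrix M, block width bw, the short xy/alpha scratch rows and the X0/Y0/W0 offsets) are assumed to be prepared exactly as in that invoker, with the module's usual headers providing the includes.

// Minimal sketch, assumed to be built inside the imgproc module so that
// precomp.hpp supplies <algorithm>, <climits>, cv::Ptr, cv::saturate_cast, etc.
#include "precomp.hpp"
#include "imgwarp.hpp"   // declares cv::opt_SSE4_1::WarpPerspectiveLine_SSE4

static void warpPerspectiveBlockline_sketch(const double* M, short* xy, short* alpha,
                                            double X0, double Y0, double W0, int bw)
{
#if CV_TRY_SSE4_1
    cv::Ptr<cv::opt_SSE4_1::WarpPerspectiveLine_SSE4> impl;
    if (CV_CPU_HAS_SUPPORT_SSE4_1)
        impl = cv::opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M); // caches M[0], M[3], M[6] and clamp constants
    if (impl)
    {
        if (alpha)
            impl->process(M, xy, alpha, X0, Y0, W0, bw); // filtered path: coords >> INTER_BITS plus alpha table
        else
            impl->processNN(M, xy, X0, Y0, W0, bw);      // nearest-neighbour path: rounded integer coords only
        return;
    }
#endif
    // Scalar fallback for the nearest-neighbour case, same as the tail loop kept in imgwarp.cpp
    // (the filtered fallback additionally fills alpha[], exactly as in the invoker).
    for (int x1 = 0; x1 < bw; x1++)
    {
        double W = W0 + M[6]*x1;
        W = W ? 1./W : 0;
        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
        xy[x1*2] = cv::saturate_cast<short>(cv::saturate_cast<int>(fX));
        xy[x1*2 + 1] = cv::saturate_cast<short>(cv::saturate_cast<int>(fY));
    }
}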