|
|
|
@@ -1265,47 +1265,72 @@ public:
 
         int dx = 0;
         const uchar* S0 = S;
         const uchar* S1 = S0 + step;
-        __m128i masklow = _mm_set1_epi16(0x00ff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi16(2);
 
         if (cn == 1)
         {
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            __m128i masklow = _mm_set1_epi16(0x00ff);
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1)));
+                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
 
-                _mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero));
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
-            for ( ; dx < w - 6; dx += 6, S0 += 12, S1 += 12, D += 6)
+            for ( ; dx < w; dx += 6, S0 += 12, S1 += 12, D += 6)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+3), s0);
             }
         else
         {
            CV_Assert(cn == 4);
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+4), s0);
             }
         }
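
Reviewer note (not part of the patch): the substance of this hunk is rounding accuracy. _mm_avg_epu8 computes (x + y + 1) >> 1 per lane, so the old path rounds once per row and once more vertically, which can overshoot the true 2x2 mean; the new path widens to 16 bits and computes (a + b + c + d + 2) >> 2 exactly. A minimal scalar sketch of the two behaviours (helper names are illustrative only, not from the patch):

    #include <cstdio>

    // Per-lane behaviour of _mm_avg_epu8: average with rounding up.
    static unsigned avg_round(unsigned x, unsigned y) { return (x + y + 1) >> 1; }

    // Old path: rounded horizontal average per row, then a rounded vertical average.
    static unsigned old_2x2(unsigned a, unsigned b, unsigned c, unsigned d)
    {
        return avg_round(avg_round(a, b), avg_round(c, d));
    }

    // New path: exact sum of the four pixels plus the rounding constant delta2 = 2.
    static unsigned new_2x2(unsigned a, unsigned b, unsigned c, unsigned d)
    {
        return (a + b + c + d + 2) >> 2;
    }

    int main()
    {
        // 2x2 block {0, 0; 0, 1}: the chained averages give 1, the exact form gives 0.
        std::printf("old=%u new=%u\n", old_2x2(0, 0, 0, 1), new_2x2(0, 0, 0, 1));
        return 0;
    }
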
@@ -1314,8 +1339,8 @@ public:
 
 private:
     int cn;
-    int step;
     bool use_simd;
+    int step;
 };
 
 class ResizeAreaFastVec_SIMD_16u
@@ -1337,45 +1362,58 @@ public:
 
         const ushort* S1 = (const ushort*)(S0 + step);
         __m128i masklow = _mm_set1_epi32(0x0000ffff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2);
 
         if (cn == 1)
         {
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2)));
+                __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
 
-                s = _mm_and_si128(s, masklow);
-                s = _mm_packs_epi32(s, zero);
-                _mm_storel_epi64((__m128i*)D, s);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
             for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6)));
-
-                _mm_storel_epi64((__m128i*)D, s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi16(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         else
         {
             CV_Assert(cn == 4);
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8)));
-
-                _mm_storel_epi64((__m128i*)(D), s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
+                __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
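
Reviewer note (not part of the patch): the 16u class gets the same restructuring, with the sums widened to 32 bits (masklow = 0x0000ffff, delta2 = _mm_set1_epi32(2)), so the two 16-bit neighbours held in each 32-bit lane can be summed without overflow before the shift. A minimal scalar sketch of the per-lane arithmetic in the new cn == 1 branch, up to but not including the final pack (names are illustrative only):

    #include <cstdint>
    #include <cstdio>

    // One 32-bit lane holds two neighbouring source pixels; (r >> 16) + (r & 0xffff)
    // sums that pair, and two 16-bit values always fit in 32 bits.
    static uint32_t area2x2_16u(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
    {
        uint32_t row0 = (uint32_t)a + b;   // _mm_srli_epi32(r0, 16) + (r0 & masklow)
        uint32_t row1 = (uint32_t)c + d;   // same for the second source row
        return (row0 + row1 + 2) >> 2;     // add delta2, then shift right by 2
    }

    int main()
    {
        std::printf("%u\n", (unsigned)area2x2_16u(1000, 1001, 1002, 1003));  // prints 1002
        return 0;
    }
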
@@ -1404,7 +1442,7 @@ struct ResizeAreaFastVec
 
     int operator() (const T* S, T* D, int w) const
     {
-        if( !fast_mode )
+        if (!fast_mode)
             return 0;
 
         const T* nextS = (const T*)((const uchar*)S + step);
|
|
|
|