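Improve RowVec_8u32s(): in the smallValues path, replace the _mm_mulhi_epi16/_mm_mullo_epi16 + unpack multiply with _mm_madd_epi16, and process 8 pixels per main-loop iteration instead of 16.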

pull/6924/head
k-shinotsuka 8 years ago
parent 69c4e84dad
commit 6f8695c178
  1. 48
      modules/imgproc/src/filter.cpp

@ -506,56 +506,52 @@ struct RowVec_8u32s
if( smallValues ) if( smallValues )
{ {
for( ; i <= width - 16; i += 16 ) __m128i z = _mm_setzero_si128();
for( ; i <= width - 8; i += 8 )
{ {
const uchar* src = _src + i; const uchar* src = _src + i;
__m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z; __m128i s0 = z, s1 = z;
__m128i x0, x1, x2, x3;
for( k = 0; k < _ksize; k++, src += cn ) for( k = 0; k < _ksize; k++, src += cn )
{ {
f = _mm_cvtsi32_si128(_kx[k]); __m128i f = _mm_cvtsi32_si128(_kx[k]);
f = _mm_shuffle_epi32(f, 0); f = _mm_shuffle_epi32(f, 0);
f = _mm_packs_epi32(f, f);
x0 = _mm_loadu_si128((const __m128i*)src); __m128i x0 = _mm_loadl_epi64((const __m128i*)src);
x2 = _mm_unpackhi_epi8(x0, z);
x0 = _mm_unpacklo_epi8(x0, z); x0 = _mm_unpacklo_epi8(x0, z);
x1 = _mm_mulhi_epi16(x0, f);
x3 = _mm_mulhi_epi16(x2, f); __m128i x1 = _mm_unpackhi_epi16(x0, z);
x0 = _mm_mullo_epi16(x0, f); x0 = _mm_unpacklo_epi16(x0, z);
x2 = _mm_mullo_epi16(x2, f);
x0 = _mm_madd_epi16(x0, f);
s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); x1 = _mm_madd_epi16(x1, f);
s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3)); s0 = _mm_add_epi32(s0, x0);
s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3)); s1 = _mm_add_epi32(s1, x1);
} }
_mm_store_si128((__m128i*)(dst + i), s0); _mm_store_si128((__m128i*)(dst + i), s0);
_mm_store_si128((__m128i*)(dst + i + 4), s1); _mm_store_si128((__m128i*)(dst + i + 4), s1);
_mm_store_si128((__m128i*)(dst + i + 8), s2);
_mm_store_si128((__m128i*)(dst + i + 12), s3);
} }
for( ; i <= width - 4; i += 4 ) if( i <= width - 4 )
{ {
const uchar* src = _src + i; const uchar* src = _src + i;
__m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1; __m128i s0 = z;
for( k = 0; k < _ksize; k++, src += cn ) for( k = 0; k < _ksize; k++, src += cn )
{ {
f = _mm_cvtsi32_si128(_kx[k]); __m128i f = _mm_cvtsi32_si128(_kx[k]);
f = _mm_shuffle_epi32(f, 0); f = _mm_shuffle_epi32(f, 0);
f = _mm_packs_epi32(f, f);
x0 = _mm_cvtsi32_si128(*(const int*)src); __m128i x0 = _mm_cvtsi32_si128(*(const int*)src);
x0 = _mm_unpacklo_epi8(x0, z); x0 = _mm_unpacklo_epi8(x0, z);
x1 = _mm_mulhi_epi16(x0, f); x0 = _mm_unpacklo_epi16(x0, z);
x0 = _mm_mullo_epi16(x0, f); x0 = _mm_madd_epi16(x0, f);
s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); s0 = _mm_add_epi32(s0, x0);
} }
_mm_store_si128((__m128i*)(dst + i), s0); _mm_store_si128((__m128i*)(dst + i), s0);
i += 4;
} }
} }
return i; return i;

Loading…
Cancel
Save