@@ -619,28 +619,28 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_SSE2
        if(USE_SSE2)
        {
            __m128 scale128 = _mm_set1_ps (scale);
            __m128 shift128 = _mm_set1_ps (shift);
            for(; x <= size.width - 8; x += 8 )
            {
                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                r0 = _mm_cvtps_epi32(rf0);
                r1 = _mm_cvtps_epi32(rf1);
                r0 = _mm_packs_epi32(r0, r1);
                _mm_storeu_si128((__m128i*)(dst + x), r0);
            }
        }
        #endif

        for(; x < size.width; x++ )
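Note on the hunk above: `_mm_unpacklo_epi16(r0, r0)` duplicates each 16-bit lane into both halves of a 32-bit lane, so the arithmetic right shift by 16 leaves the sign-extended short. This widens shorts to ints with SSE2 alone, without SSE4.1's `_mm_cvtepi16_epi32`. A minimal standalone sketch of just that trick (the test values and `main()` are illustrative, not from the patch):

    #include <emmintrin.h>   // SSE2
    #include <cstdio>

    int main()
    {
        short in[4] = { -3, 7, -32768, 32767 };
        // Load 4 shorts into the low 64 bits of the register.
        __m128i r0 = _mm_loadl_epi64((const __m128i*)in);
        // After unpacklo(r0, r0), each 32-bit lane holds
        // (in[i] << 16) | (in[i] & 0xffff); shifting right
        // arithmetically by 16 yields the sign-extended value.
        __m128i wide = _mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16);
        int out[4];
        _mm_storeu_si128((__m128i*)out, wide);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // -3 7 -32768 32767
        return 0;
    }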
@@ -659,17 +659,17 @@ cvt_( const T* src, size_t sstep,
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            DT t0, t1;
            t0 = saturate_cast<DT>(src[x]);
            t1 = saturate_cast<DT>(src[x+1]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<DT>(src[x+2]);
            t1 = saturate_cast<DT>(src[x+3]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = saturate_cast<DT>(src[x]);
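The unrolled body above computes exactly what the scalar tail loop computes; the paired temporaries just give the compiler two independent dependency chains per iteration. A plain-types sketch of the same pattern (hypothetical names; clamp_to_short stands in for cv::saturate_cast<short>):

    #include <cstdio>

    // Hypothetical stand-in for cv::saturate_cast<short>.
    static inline short clamp_to_short(int v)
    {
        return (short)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
    }

    static void convert_row(const int* src, short* dst, int width)
    {
        int x = 0;
        for( ; x <= width - 4; x += 4 )   // 4x unrolled main loop
        {
            short t0 = clamp_to_short(src[x]), t1 = clamp_to_short(src[x+1]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = clamp_to_short(src[x+2]); t1 = clamp_to_short(src[x+3]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
        for( ; x < width; x++ )           // tail for widths not divisible by 4
            dst[x] = clamp_to_short(src[x]);
    }

    int main()
    {
        int src[5] = { 1, 100000, -100000, 42, 7 };
        short dst[5];
        convert_row(src, dst, 5);
        printf("%d %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3], dst[4]);  // 1 32767 -32768 42 7
        return 0;
    }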
@@ -687,20 +687,20 @@ cvt_<float, short>( const float* src, size_t sstep,
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_SSE2
        if(USE_SSE2){
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128 src128 = _mm_loadu_ps (src + x);
                __m128i src_int128 = _mm_cvtps_epi32 (src128);

                src128 = _mm_loadu_ps (src + x + 4);
                __m128i src1_int128 = _mm_cvtps_epi32 (src128);

                src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
                _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
            }
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = saturate_cast<short>(src[x]);
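In the hunk above, the pair _mm_cvtps_epi32 + _mm_packs_epi32 reproduces saturate_cast<short>(float): the conversion rounds to nearest (ties to even, under the default MXCSR rounding mode) and the pack saturates each 32-bit lane to [-32768, 32767]. A standalone sketch with illustrative inputs (not from the patch):

    #include <emmintrin.h>   // SSE2
    #include <cstdio>

    int main()
    {
        float in[8] = { 1.5f, 2.5f, -40000.f, 70000.f, 0.4f, -0.6f, 3.f, -3.f };
        __m128i lo = _mm_cvtps_epi32(_mm_loadu_ps(in));      // round to nearest even
        __m128i hi = _mm_cvtps_epi32(_mm_loadu_ps(in + 4));
        __m128i p  = _mm_packs_epi32(lo, hi);                // saturate to int16
        short out[8];
        _mm_storeu_si128((__m128i*)out, p);
        for( int i = 0; i < 8; i++ )
            printf("%d ", out[i]);    // 2 2 -32768 32767 0 -1 3 -3
        printf("\n");
        return 0;
    }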