@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
     {
         int x = 0;
 
-        #if CV_SSE2
-        if(USE_SSE2)//~5X
+        #if CV_AVX2
+        if (USE_AVX2) // 16 short values per iteration
         {
-            __m128 scale128 = _mm_set1_ps (scale);
-            __m128 shift128 = _mm_set1_ps (shift);
-            for(; x <= size.width - 8; x += 8 )
-            {
-                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
-                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
-                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
-                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
-                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
-                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-                r0 = _mm_cvtps_epi32(rf0);
-                r1 = _mm_cvtps_epi32(rf1);
-
-                _mm_storeu_si128((__m128i*)(dst + x), r0);
-                _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-            }
+            __m256 scale256 = _mm256_set1_ps (scale);
+            __m256 shift256 = _mm256_set1_ps (shift);
+            for ( ; x <= size.width - 16; x += 16)
+            {
+                __m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x));
+                v_src = _mm256_permute4x64_epi64(v_src, 0xd8); // keep the output element order across the two 128-bit lanes
+                __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); // sign-extend int16 -> int32
+                __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
+                __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256);
+                __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256);
+                _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
+                _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
+            }
         }
+        #endif
+        #if CV_SSE2
+        if (USE_SSE2) // ~5x over the scalar loop; 8 short values per iteration
+        {
+            __m128 scale128 = _mm_set1_ps(scale);
+            __m128 shift128 = _mm_set1_ps(shift);
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
+                __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); // sign-extend int16 -> int32, then to float
+                __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+                r0 = _mm_cvtps_epi32(rf0);
+                r1 = _mm_cvtps_epi32(rf1);
+
+                _mm_storeu_si128((__m128i*)(dst + x), r0);
+                _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
+            }
+        }
         #elif CV_NEON
         float32x4_t v_shift = vdupq_n_f32(shift);
         for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
         }
         #endif
 
-        //We will wait Haswell
-        /*
-        #if CV_AVX
-        if(USE_AVX)//2X - bad variant
-        {
-            ////TODO:AVX implementation (optimization?) required
-            __m256 scale256 = _mm256_set1_ps (scale);
-            __m256 shift256 = _mm256_set1_ps (shift);
-            for(; x <= size.width - 8; x += 8 )
-            {
-                __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-                __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-                __m256i res = _mm256_cvtps_epi32(r0);
-                _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-            }
-        }
-        #endif*/
-
         for(; x < size.width; x++ )
             dst[x] = saturate_cast<int>(src[x]*scale + shift);
     }
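// --- Not part of the patch: a minimal standalone sketch for checking the AVX2 row
// --- conversion against the scalar tail expression, on a width that is not a multiple
// --- of 16 and on negative inputs. It assumes an AVX2-capable build and OpenCV's
// --- cv::saturate_cast; the names cvtRowAvx2/main are illustrative only.
#include <immintrin.h>
#include <opencv2/core.hpp>
#include <cstdio>

static void cvtRowAvx2(const short* src, int* dst, int width, float scale, float shift)
{
    int x = 0;
    __m256 scale256 = _mm256_set1_ps(scale), shift256 = _mm256_set1_ps(shift);
    for( ; x <= width - 16; x += 16 )
    {
        __m256i v_src = _mm256_loadu_si256((const __m256i*)(src + x));
        v_src = _mm256_permute4x64_epi64(v_src, 0xd8);                           // fix cross-lane element order
        __m256i lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); // sign-extend int16 -> int32
        __m256i hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
        __m256 d0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(lo), scale256), shift256);
        __m256 d1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(hi), scale256), shift256);
        _mm256_storeu_si256((__m256i*)(dst + x), _mm256_cvtps_epi32(d0));
        _mm256_storeu_si256((__m256i*)(dst + x + 8), _mm256_cvtps_epi32(d1));
    }
    for( ; x < width; x++ )                                                      // scalar tail, as in the patch
        dst[x] = cv::saturate_cast<int>(src[x]*scale + shift);
}

int main()
{
    enum { W = 37 };                                               // not a multiple of 16, so the tail also runs
    short src[W]; int dst[W];
    for( int i = 0; i < W; i++ ) src[i] = (short)(i*100 - 1800);   // includes negative values
    cvtRowAvx2(src, dst, W, 0.5f, 3.f);
    for( int i = 0; i < W; i++ )
        if( dst[i] != cv::saturate_cast<int>(src[i]*0.5f + 3.f) )
            printf("mismatch at %d: %d\n", i, dst[i]);
    return 0;
}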