@@ -1569,7 +1569,239 @@ struct cvtScale_SIMD
    }
};

#if CV_NEON
#if CV_SSE2
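// Each SSE2 specialization below vectorizes the same per-element operation,
// effectively dst[i] = saturate_cast<DT>(src[i] * scale + shift), eight pixels
// at a time: the 8 uchars are zero-extended to two groups of 32-bit ints,
// converted to float, multiplied/added with v_scale and v_shift, then converted
// and packed back to the destination type. operator() returns the number of
// elements it processed; the remaining (width - x) elements are presumably
// finished by the caller's scalar loop.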
template <>
struct cvtScale_SIMD<uchar, uchar, float>
{
    int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
        }

        return x;
    }
};
|
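// The schar variant below differs from the uchar one only in the final pack:
// _mm_packs_epi16 (saturation to signed 8-bit) replaces _mm_packus_epi16.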
template <>
struct cvtScale_SIMD<uchar, schar, float>
{
    int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
        }

        return x;
    }
};
|
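// The ushort destination needs _mm_packus_epi32 (unsigned saturating 32->16
// pack), which is an SSE4.1 instruction, so this specialization is compiled
// only under CV_SSE4_1 and additionally checks SSE4.1 support at run time.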
#if CV_SSE4_1

template <>
struct cvtScale_SIMD<uchar, ushort, float>
{
    cvtScale_SIMD()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                             _mm_cvtps_epi32(v_dst_1));
            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
        }

        return x;
    }

    bool haveSSE;
};

#endif
template <>
struct cvtScale_SIMD<uchar, short, float>
{
    int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
        }

        return x;
    }
};
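// For the wider destinations below (int, float, double) no narrowing pack is
// needed; the scaled values are converted as necessary and stored directly.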
template <>
struct cvtScale_SIMD<uchar, int, float>
{
    int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
        }

        return x;
    }
};
template <>
struct cvtScale_SIMD<uchar, float, float>
{
    int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            _mm_storeu_ps(dst + x, v_dst_0);
            _mm_storeu_ps(dst + x + 4, v_dst_1);
        }

        return x;
    }
};
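// _mm_cvtps_pd converts only the two low floats of its argument, so each
// __m128 of results is written as two 2-double stores, with the high pair
// first brought down by an 8-byte shift.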
template <>
struct cvtScale_SIMD<uchar, double, float>
{
    int operator () (const uchar * src, double * dst, int width, float scale, float shift) const
    {
        int x = 0;

        if (!USE_SSE2)
            return x;

        __m128i v_zero = _mm_setzero_si128();
        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);

        for ( ; x <= width - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);

            _mm_storeu_pd(dst + x, _mm_cvtps_pd(v_dst_0));
            _mm_storeu_pd(dst + x + 2, _mm_cvtps_pd(_mm_castsi128_ps(
                _mm_srli_si128(_mm_castps_si128(v_dst_0), 8))));

            _mm_storeu_pd(dst + x + 4, _mm_cvtps_pd(v_dst_1));
            _mm_storeu_pd(dst + x + 6, _mm_cvtps_pd(_mm_castsi128_ps(
                _mm_srli_si128(_mm_castps_si128(v_dst_1), 8))));
        }

        return x;
    }
};
#elif CV_NEON

// from uchar