@@ -915,6 +915,10 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
    size_t src_step = _src.step / sizeof(src[0]);
    size_t dst_step = _dst.step / sizeof(dst[0]);
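    // The intrinsics below are all SSE2 double-precision ops, so the runtime
    // guard has to test for SSE2 support specifically.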
#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
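    // When source and destination are both continuous in memory, the image
    // can be processed as one long row, maximizing the span of the vector loop.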
    if (_src.isContinuous() && _dst.isContinuous())
    {
        roi.width *= roi.height;
@@ -927,6 +931,45 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
        {
            j = 0;
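            // THRESH_BINARY: dst[j] = src[j] > thresh ? maxval : 0.
            // _mm_cmpgt_pd yields an all-ones or all-zeros 64-bit mask per
            // lane, so ANDing the mask with maxval selects maxval or +0.0
            // with no branch.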
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_cmpgt_pd( v0, thresh2 );
                    v1 = _mm_cmpgt_pd( v1, thresh2 );
                    v2 = _mm_cmpgt_pd( v2, thresh2 );
                    v3 = _mm_cmpgt_pd( v3, thresh2 );
                    v0 = _mm_and_pd( v0, maxval2 );
                    v1 = _mm_and_pd( v1, maxval2 );
                    v2 = _mm_and_pd( v2, maxval2 );
                    v3 = _mm_and_pd( v3, maxval2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
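            // AArch64 NEON variant of the same mask-and-select: the f64
            // comparisons return uint64x2_t masks, hence the reinterpret
            // casts around the bitwise AND.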
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);
            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh), v_maxval);
                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh), v_maxval);
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
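            // Scalar tail: handles whatever the vector loop leaves over
            // (up to 7 elements for SSE2, up to 3 for NEON).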
            for (; j < roi.width; j++)
                dst[j] = src[j] > thresh ? maxval : 0;
@@ -938,6 +981,45 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        {
            j = 0;
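            // THRESH_BINARY_INV: the same selection trick with the comparison
            // flipped to src <= thresh (_mm_cmple_pd / vcleq_f64).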
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh), maxval2 = _mm_set1_pd(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_cmple_pd( v0, thresh2 );
                    v1 = _mm_cmple_pd( v1, thresh2 );
                    v2 = _mm_cmple_pd( v2, thresh2 );
                    v3 = _mm_cmple_pd( v3, thresh2 );
                    v0 = _mm_and_pd( v0, maxval2 );
                    v1 = _mm_and_pd( v1, maxval2 );
                    v2 = _mm_and_pd( v2, maxval2 );
                    v3 = _mm_and_pd( v3, maxval2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);
            uint64x2_t v_maxval = vreinterpretq_u64_f64(vdupq_n_f64(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh), v_maxval);
                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh), v_maxval);
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
@@ -948,6 +1030,40 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        {
            j = 0;
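            // THRESH_TRUNC: clamp values above thresh down to thresh. This is
            // a plain vector min, so no compare mask is needed.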
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_min_pd( v0, thresh2 );
                    v1 = _mm_min_pd( v1, thresh2 );
                    v2 = _mm_min_pd( v2, thresh2 );
                    v3 = _mm_min_pd( v3, thresh2 );
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                float64x2_t v_dst0 = vminq_f64(v_src0, v_thresh);
                float64x2_t v_dst1 = vminq_f64(v_src1, v_thresh);
                vst1q_f64(dst + j, v_dst0);
                vst1q_f64(dst + j + 2, v_dst1);
            }
#endif
            for (; j < roi.width; j++)
                dst[j] = std::min(src[j], thresh);
        }
@@ -958,6 +1074,42 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        {
            j = 0;
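            // THRESH_TOZERO: keep src where src > thresh, zero elsewhere.
            // The compare mask is ANDed with the source values themselves
            // rather than with maxval.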
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_and_pd( v0, _mm_cmpgt_pd(v0, thresh2));
                    v1 = _mm_and_pd( v1, _mm_cmpgt_pd(v1, thresh2));
                    v2 = _mm_and_pd( v2, _mm_cmpgt_pd(v2, thresh2));
                    v3 = _mm_and_pd( v3, _mm_cmpgt_pd(v3, thresh2));
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcgtq_f64(v_src0, v_thresh),
                                              vreinterpretq_u64_f64(v_src0));
                uint64x2_t v_dst1 = vandq_u64(vcgtq_f64(v_src1, v_thresh),
                                              vreinterpretq_u64_f64(v_src1));
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
            {
                double v = src[j];
@@ -971,6 +1123,42 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
        {
            j = 0;
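            // THRESH_TOZERO_INV: the complementary mask (src <= thresh)
            // ANDed with the source values.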
#if CV_SSE2
            if( useSIMD )
            {
                __m128d thresh2 = _mm_set1_pd(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128d v0, v1, v2, v3;
                    v0 = _mm_loadu_pd( src + j );
                    v1 = _mm_loadu_pd( src + j + 2 );
                    v2 = _mm_loadu_pd( src + j + 4 );
                    v3 = _mm_loadu_pd( src + j + 6 );
                    v0 = _mm_and_pd( v0, _mm_cmple_pd(v0, thresh2));
                    v1 = _mm_and_pd( v1, _mm_cmple_pd(v1, thresh2));
                    v2 = _mm_and_pd( v2, _mm_cmple_pd(v2, thresh2));
                    v3 = _mm_and_pd( v3, _mm_cmple_pd(v3, thresh2));
                    _mm_storeu_pd( dst + j, v0 );
                    _mm_storeu_pd( dst + j + 2, v1 );
                    _mm_storeu_pd( dst + j + 4, v2 );
                    _mm_storeu_pd( dst + j + 6, v3 );
                }
            }
#elif CV_NEON && defined(__aarch64__)
            float64x2_t v_thresh = vdupq_n_f64(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float64x2_t v_src0 = vld1q_f64(src + j);
                float64x2_t v_src1 = vld1q_f64(src + j + 2);
                uint64x2_t v_dst0 = vandq_u64(vcleq_f64(v_src0, v_thresh),
                                              vreinterpretq_u64_f64(v_src0));
                uint64x2_t v_dst1 = vandq_u64(vcleq_f64(v_src1, v_thresh),
                                              vreinterpretq_u64_f64(v_src1));
                vst1q_f64(dst + j, vreinterpretq_f64_u64(v_dst0));
                vst1q_f64(dst + j + 2, vreinterpretq_f64_u64(v_dst1));
            }
#endif
            for (; j < roi.width; j++)
            {
                double v = src[j];
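A minimal smoke test for this path, as a sketch: it assumes a build where cv::threshold accepts CV_64F input (which is what this patch adds); the 1x9 row is sized so the SSE2 body (8 doubles per iteration) runs once and the scalar tail handles the final element.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <cassert>

int main()
{
    // 9 doubles: one full 8-wide SSE2 iteration plus a 1-element scalar tail.
    cv::Mat src = (cv::Mat_<double>(1, 9) << -1, 0, 0.25, 0.5, 0.75, 1, 2, 3, 4);
    cv::Mat dst;
    cv::threshold(src, dst, 0.5, 10.0, cv::THRESH_BINARY);
    for (int j = 0; j < src.cols; j++)
        assert(dst.at<double>(0, j) == (src.at<double>(0, j) > 0.5 ? 10.0 : 0.0));
    return 0;
}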