diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index 9c6168916d..d724db4020 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -195,82 +195,78 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     int j = 0;
     const uchar* src = _src.ptr();
     uchar* dst = _dst.ptr();
 
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        v_uint8x16 thresh_u = v_setall_u8( thresh );
-        v_uint8x16 maxval16 = v_setall_u8( maxval );
-
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = thresh_u < v0;
-                    v0 = v0 & maxval16;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-
-        case THRESH_BINARY_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = v0 <= thresh_u;
-                    v0 = v0 & maxval16;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-
-        case THRESH_TRUNC:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = v0 - ( v0 - thresh_u );
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-
-        case THRESH_TOZERO:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = ( thresh_u < v0 ) & v0;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-
-        case THRESH_TOZERO_INV:
-            for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                for( j = 0; j <= roi.width - 16; j += 16 )
-                {
-                    v_uint8x16 v0;
-                    v0 = v_load( src + j );
-                    v0 = ( v0 <= thresh_u ) & v0;
-                    v_store( dst + j, v0 );
-                }
-            }
-            break;
-        }
-    }
+#if CV_SIMD
+    v_uint8 thresh_u = vx_setall_u8( thresh );
+    v_uint8 maxval16 = vx_setall_u8( maxval );
+
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+
+    case THRESH_BINARY_INV:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+
+    case THRESH_TRUNC:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = v0 - ( v0 - thresh_u );
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+
+    case THRESH_TOZERO:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = ( thresh_u < v0 ) & v0;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+
+    case THRESH_TOZERO_INV:
+        for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            {
+                v_uint8 v0;
+                v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh_u ) & v0;
+                v_store( dst + j, v0 );
+            }
+        }
+        break;
+    }
 #endif
@@ -362,125 +358,156 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
     const ushort* src = _src.ptr();
     ushort* dst = _dst.ptr();
 
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
-    if (useSIMD)
-    {
-        int i, j;
-        v_uint16x8 thresh_u = v_setall_u16(thresh);
-        v_uint16x8 maxval16 = v_setall_u16(maxval);
-
-        switch (type)
-        {
-        case THRESH_BINARY:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                for (j = 0; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = thresh_u < v0;
-                    v1 = thresh_u < v1;
-                    v0 = v0 & maxval16;
-                    v1 = v1 & maxval16;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-
-                for (; j < roi.width; j++)
-                    dst[j] = threshBinary(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_BINARY_INV:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = v0 <= thresh_u;
-                    v1 = v1 <= thresh_u;
-                    v0 = v0 & maxval16;
-                    v1 = v1 & maxval16;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-
-                for (; j < roi.width; j++)
-                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_TRUNC:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = v_min(v0, thresh_u);
-                    v1 = v_min(v1, thresh_u);
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-
-                for (; j < roi.width; j++)
-                    dst[j] = threshTrunc(src[j], thresh);
-            }
-            break;
-
-        case THRESH_TOZERO:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = (thresh_u < v0) & v0;
-                    v1 = (thresh_u < v1) & v1;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-
-                for (; j < roi.width; j++)
-                    dst[j] = threshToZero(src[j], thresh);
-            }
-            break;
-
-        case THRESH_TOZERO_INV:
-            for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
-            {
-                j = 0;
-                for (; j <= roi.width - 16; j += 16)
-                {
-                    v_uint16x8 v0, v1;
-                    v0 = v_load(src + j);
-                    v1 = v_load(src + j + 8);
-                    v0 = (v0 <= thresh_u) & v0;
-                    v1 = (v1 <= thresh_u) & v1;
-                    v_store(dst + j, v0);
-                    v_store(dst + j + 8, v1);
-                }
-
-                for (; j < roi.width; j++)
-                    dst[j] = threshToZeroInv(src[j], thresh);
-            }
-            break;
-        }
-    }
-    else
+#if CV_SIMD
+    int i, j;
+    v_uint16 thresh_u = vx_setall_u16(thresh);
+    v_uint16 maxval16 = vx_setall_u16(maxval);
+
+    switch (type)
+    {
+    case THRESH_BINARY:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = thresh_u < v0;
+                v1 = thresh_u < v1;
+                v0 = v0 & maxval16;
+                v1 = v1 & maxval16;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = thresh_u < v0;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+
+            for (; j < roi.width; j++)
+                dst[j] = threshBinary(src[j], thresh, maxval);
+        }
+        break;
+
+    case THRESH_BINARY_INV:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = v0 <= thresh_u;
+                v1 = v1 <= thresh_u;
+                v0 = v0 & maxval16;
+                v1 = v1 & maxval16;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v0 <= thresh_u;
+                v0 = v0 & maxval16;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+
+            for (; j < roi.width; j++)
+                dst[j] = threshBinaryInv(src[j], thresh, maxval);
+        }
+        break;
+
+    case THRESH_TRUNC:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = v_min(v0, thresh_u);
+                v1 = v_min(v1, thresh_u);
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = v_min(v0, thresh_u);
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+
+            for (; j < roi.width; j++)
+                dst[j] = threshTrunc(src[j], thresh);
+        }
+        break;
+
+    case THRESH_TOZERO:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = (thresh_u < v0) & v0;
+                v1 = (thresh_u < v1) & v1;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (thresh_u < v0) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+
+            for (; j < roi.width; j++)
+                dst[j] = threshToZero(src[j], thresh);
+        }
+        break;
+
+    case THRESH_TOZERO_INV:
+        for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
+        {
+            j = 0;
+            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            {
+                v_uint16 v0, v1;
+                v0 = vx_load(src + j);
+                v1 = vx_load(src + j + v_uint16::nlanes);
+                v0 = (v0 <= thresh_u) & v0;
+                v1 = (v1 <= thresh_u) & v1;
+                v_store(dst + j, v0);
+                v_store(dst + j + v_uint16::nlanes, v1);
+            }
+            if (j <= roi.width - v_uint16::nlanes)
+            {
+                v_uint16 v0 = vx_load(src + j);
+                v0 = (v0 <= thresh_u) & v0;
+                v_store(dst + j, v0);
+                j += v_uint16::nlanes;
+            }
+
+            for (; j < roi.width; j++)
+                dst[j] = threshToZeroInv(src[j], thresh);
+        }
+        break;
+    }
+#else
+    threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
 #endif
-    {
-        threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
 }
@@ -556,128 +583,159 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     }
 #endif
 
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        int i, j;
-        v_int16x8 thresh8 = v_setall_s16( thresh );
-        v_int16x8 maxval8 = v_setall_s16( maxval );
-
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = thresh8 < v0;
-                    v1 = thresh8 < v1;
-                    v0 = v0 & maxval8;
-                    v1 = v1 & maxval8;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_BINARY_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = v0 <= thresh8;
-                    v1 = v1 <= thresh8;
-                    v0 = v0 & maxval8;
-                    v1 = v1 & maxval8;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_TRUNC:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = v_min( v0, thresh8 );
-                    v1 = v_min( v1, thresh8 );
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshTrunc( src[j], thresh );
-            }
-            break;
-
-        case THRESH_TOZERO:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = ( thresh8 < v0 ) & v0;
-                    v1 = ( thresh8 < v1 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZero(src[j], thresh);
-            }
-            break;
-
-        case THRESH_TOZERO_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 16; j += 16 )
-                {
-                    v_int16x8 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 8 );
-                    v0 = ( v0 <= thresh8 ) & v0;
-                    v1 = ( v1 <= thresh8 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 8, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZeroInv(src[j], thresh);
-            }
-            break;
-        default:
-            CV_Error( CV_StsBadArg, "" ); return;
-        }
-    }
-    else
+#if CV_SIMD
+    int i, j;
+    v_int16 thresh8 = vx_setall_s16( thresh );
+    v_int16 maxval8 = vx_setall_s16( maxval );
+
+    switch( type )
+    {
+    case THRESH_BINARY:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = thresh8 < v0;
+                v1 = thresh8 < v1;
+                v0 = v0 & maxval8;
+                v1 = v1 & maxval8;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = thresh8 < v0;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinary(src[j], thresh, maxval);
+        }
+        break;
+
+    case THRESH_BINARY_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = v0 <= thresh8;
+                v1 = v1 <= thresh8;
+                v0 = v0 & maxval8;
+                v1 = v1 & maxval8;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v0 <= thresh8;
+                v0 = v0 & maxval8;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+
+            for( ; j < roi.width; j++ )
+                dst[j] = threshBinaryInv(src[j], thresh, maxval);
+        }
+        break;
+
+    case THRESH_TRUNC:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = v_min( v0, thresh8 );
+                v1 = v_min( v1, thresh8 );
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = v_min( v0, thresh8 );
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+
+            for( ; j < roi.width; j++ )
+                dst[j] = threshTrunc( src[j], thresh );
+        }
+        break;
+
+    case THRESH_TOZERO:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = ( thresh8 < v0 ) & v0;
+                v1 = ( thresh8 < v1 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( thresh8 < v0 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZero(src[j], thresh);
+        }
+        break;
+
+    case THRESH_TOZERO_INV:
+        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+        {
+            j = 0;
+            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            {
+                v_int16 v0, v1;
+                v0 = vx_load( src + j );
+                v1 = vx_load( src + j + v_int16::nlanes );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v1 = ( v1 <= thresh8 ) & v1;
+                v_store( dst + j, v0 );
+                v_store( dst + j + v_int16::nlanes, v1 );
+            }
+            if( j <= roi.width - v_int16::nlanes )
+            {
+                v_int16 v0 = vx_load( src + j );
+                v0 = ( v0 <= thresh8 ) & v0;
+                v_store( dst + j, v0 );
+                j += v_int16::nlanes;
+            }
+
+            for( ; j < roi.width; j++ )
+                dst[j] = threshToZeroInv(src[j], thresh);
+        }
+        break;
+    default:
+        CV_Error( CV_StsBadArg, "" ); return;
+    }
+#else
+    threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
 #endif
-    {
-        threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
 }
@@ -736,175 +794,40 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     }
 #endif
 
-#if CV_SIMD128
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
-    {
-        int i, j;
-        v_float32x4 thresh4 = v_setall_f32( thresh );
-        v_float32x4 maxval4 = v_setall_f32( maxval );
-
-        switch( type )
-        {
-        case THRESH_BINARY:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = thresh4 < v0;
-                    v1 = thresh4 < v1;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_BINARY_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = v0 <= thresh4;
-                    v1 = v1 <= thresh4;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
-            }
-            break;
-
-        case THRESH_TRUNC:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = v_min( v0, thresh4 );
-                    v1 = v_min( v1, thresh4 );
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshTrunc(src[j], thresh);
-            }
-            break;
-
-        case THRESH_TOZERO:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = ( thresh4 < v0 ) & v0;
-                    v1 = ( thresh4 < v1 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZero(src[j], thresh);
-            }
-            break;
-
-        case THRESH_TOZERO_INV:
-            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
-            {
-                j = 0;
-                for( ; j <= roi.width - 8; j += 8 )
-                {
-                    v_float32x4 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 4 );
-                    v0 = ( v0 <= thresh4 ) & v0;
-                    v1 = ( v1 <= thresh4 ) & v1;
-                    v_store( dst + j, v0 );
-                    v_store( dst + j + 4, v1 );
-                }
-
-                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZeroInv(src[j], thresh);
-            }
-            break;
-        default:
-            CV_Error( CV_StsBadArg, "" ); return;
-        }
-    }
-    else
-#endif
-    {
-        threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
-    }
-}
-
-static void
-thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
-{
-    Size roi = _src.size();
-    roi.width *= _src.channels();
-    const double* src = _src.ptr();
-    double* dst = _dst.ptr();
-    size_t src_step = _src.step / sizeof(src[0]);
-    size_t dst_step = _dst.step / sizeof(dst[0]);
-
-    if (_src.isContinuous() && _dst.isContinuous())
-    {
-        roi.width *= roi.height;
-        roi.height = 1;
-    }
+#if CV_SIMD
+    int i, j;
+    v_float32 thresh4 = vx_setall_f32( thresh );
+    v_float32 maxval4 = vx_setall_f32( maxval );
 
-#if CV_SIMD128_64F
-    bool useSIMD = checkHardwareSupport( CV_CPU_SSE2 ) || checkHardwareSupport( CV_CPU_NEON );
-    if( useSIMD )
+    switch( type )
     {
-        int i, j;
-        v_float64x2 thresh2 = v_setall_f64( thresh );
-        v_float64x2 maxval2 = v_setall_f64( maxval );
-
-        switch( type )
-        {
         case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = thresh2 < v0;
-                    v1 = thresh2 < v1;
-                    v0 = v0 & maxval2;
-                    v1 = v1 & maxval2;
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v0 = thresh4 < v0;
+                    v1 = thresh4 < v1;
+                    v0 = v0 & maxval4;
+                    v1 = v1 & maxval4;
                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = thresh4 < v0;
+                    v0 = v0 & maxval4;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }
 
                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinary(src[j], thresh, maxval);
+                    dst[j] = threshBinary(src[j], thresh, maxval);
            }
            break;
@@ -912,21 +835,29 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = v0 <= thresh2;
-                    v1 = v1 <= thresh2;
-                    v0 = v0 & maxval2;
-                    v1 = v1 & maxval2;
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v0 = v0 <= thresh4;
+                    v1 = v1 <= thresh4;
+                    v0 = v0 & maxval4;
+                    v1 = v1 & maxval4;
                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
+                {
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = v0 <= thresh4;
+                    v0 = v0 & maxval4;
+                    v_store( dst + j, v0 );
+                    j += v_float32::nlanes;
                }
 
                for( ; j < roi.width; j++ )
-                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
+                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
            }
            break;
@@ -934,19 +865,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                {
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v0 = v_min( v0, thresh4 );
+                    v1 = v_min( v1, thresh4 );
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = v_min( v0, thresh2 );
-                    v1 = v_min( v1, thresh2 );
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = v_min( v0, thresh4 );
                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                    j += v_float32::nlanes;
                }
 
                for( ; j < roi.width; j++ )
-                    dst[j] = threshTrunc(src[j], thresh);
+                    dst[j] = threshTrunc(src[j], thresh);
            }
            break;
@@ -954,19 +892,26 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                {
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v0 = ( thresh4 < v0 ) & v0;
+                    v1 = ( thresh4 < v1 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = ( thresh2 < v0 ) & v0;
-                    v1 = ( thresh2 < v1 ) & v1;
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = ( thresh4 < v0 ) & v0;
                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                    j += v_float32::nlanes;
                }
 
                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZero(src[j], thresh);
+                    dst[j] = threshToZero(src[j], thresh);
            }
            break;
@@ -974,30 +919,205 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
-                for( ; j <= roi.width - 4; j += 4 )
+                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                {
+                    v_float32 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v0 = ( v0 <= thresh4 ) & v0;
+                    v1 = ( v1 <= thresh4 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float32::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float32::nlanes )
                {
-                    v_float64x2 v0, v1;
-                    v0 = v_load( src + j );
-                    v1 = v_load( src + j + 2 );
-                    v0 = ( v0 <= thresh2 ) & v0;
-                    v1 = ( v1 <= thresh2 ) & v1;
+                    v_float32 v0 = vx_load( src + j );
+                    v0 = ( v0 <= thresh4 ) & v0;
                    v_store( dst + j, v0 );
-                    v_store( dst + j + 2, v1 );
+                    j += v_float32::nlanes;
                }
 
                for( ; j < roi.width; j++ )
-                    dst[j] = threshToZeroInv(src[j], thresh);
+                    dst[j] = threshToZeroInv(src[j], thresh);
            }
            break;
        default:
-            CV_Error(CV_StsBadArg, ""); return;
-        }
    }
-    else
+#else
+    threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
 #endif
+}
+
+static void
+thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
+{
+    Size roi = _src.size();
+    roi.width *= _src.channels();
+    const double* src = _src.ptr();
+    double* dst = _dst.ptr();
+    size_t src_step = _src.step / sizeof(src[0]);
+    size_t dst_step = _dst.step / sizeof(dst[0]);
+
+    if (_src.isContinuous() && _dst.isContinuous())
    {
-        threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+        roi.width *= roi.height;
+        roi.height = 1;
    }
+
+#if CV_SIMD_64F
+    int i, j;
+    v_float64 thresh2 = vx_setall_f64( thresh );
+    v_float64 maxval2 = vx_setall_f64( maxval );
+
+    switch( type )
+    {
+        case THRESH_BINARY:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+                {
+                    v_float64 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float64::nlanes );
+                    v0 = thresh2 < v0;
+                    v1 = thresh2 < v1;
+                    v0 = v0 & maxval2;
+                    v1 = v1 & maxval2;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float64::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float64::nlanes )
+                {
+                    v_float64 v0 = vx_load( src + j );
+                    v0 = thresh2 < v0;
+                    v0 = v0 & maxval2;
+                    v_store( dst + j, v0 );
+                    j += v_float64::nlanes;
+                }
+
+                for( ; j < roi.width; j++ )
+                    dst[j] = threshBinary(src[j], thresh, maxval);
+            }
+            break;
+
+        case THRESH_BINARY_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+                {
+                    v_float64 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float64::nlanes );
+                    v0 = v0 <= thresh2;
+                    v1 = v1 <= thresh2;
+                    v0 = v0 & maxval2;
+                    v1 = v1 & maxval2;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float64::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float64::nlanes )
+                {
+                    v_float64 v0 = vx_load( src + j );
+                    v0 = v0 <= thresh2;
+                    v0 = v0 & maxval2;
+                    v_store( dst + j, v0 );
+                    j += v_float64::nlanes;
+                }
+
+                for( ; j < roi.width; j++ )
+                    dst[j] = threshBinaryInv(src[j], thresh, maxval);
+            }
+            break;
+
+        case THRESH_TRUNC:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+                {
+                    v_float64 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float64::nlanes );
+                    v0 = v_min( v0, thresh2 );
+                    v1 = v_min( v1, thresh2 );
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float64::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float64::nlanes )
+                {
+                    v_float64 v0 = vx_load( src + j );
+                    v0 = v_min( v0, thresh2 );
+                    v_store( dst + j, v0 );
+                    j += v_float64::nlanes;
+                }
+
+                for( ; j < roi.width; j++ )
+                    dst[j] = threshTrunc(src[j], thresh);
+            }
+            break;
+
+        case THRESH_TOZERO:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+                {
+                    v_float64 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float64::nlanes );
+                    v0 = ( thresh2 < v0 ) & v0;
+                    v1 = ( thresh2 < v1 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float64::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float64::nlanes )
+                {
+                    v_float64 v0 = vx_load( src + j );
+                    v0 = ( thresh2 < v0 ) & v0;
+                    v_store( dst + j, v0 );
+                    j += v_float64::nlanes;
+                }
+
+                for( ; j < roi.width; j++ )
+                    dst[j] = threshToZero(src[j], thresh);
+            }
+            break;
+
+        case THRESH_TOZERO_INV:
+            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
+            {
+                j = 0;
+                for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+                {
+                    v_float64 v0, v1;
+                    v0 = vx_load( src + j );
+                    v1 = vx_load( src + j + v_float64::nlanes );
+                    v0 = ( v0 <= thresh2 ) & v0;
+                    v1 = ( v1 <= thresh2 ) & v1;
+                    v_store( dst + j, v0 );
+                    v_store( dst + j + v_float64::nlanes, v1 );
+                }
+                if( j <= roi.width - v_float64::nlanes )
+                {
+                    v_float64 v0 = vx_load( src + j );
+                    v0 = ( v0 <= thresh2 ) & v0;
+                    v_store( dst + j, v0 );
+                    j += v_float64::nlanes;
+                }
+
+                for( ; j < roi.width; j++ )
+                    dst[j] = threshToZeroInv(src[j], thresh);
+            }
+            break;
+        default:
+            CV_Error(CV_StsBadArg, ""); return;
+    }
+#else
+    threshGeneric(roi, src, src_step, dst, dst_step, thresh, maxval, type);
+#endif
 }
 
 #ifdef HAVE_IPP
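
Note for reviewers unfamiliar with OpenCV's wide universal intrinsics: every function in this patch is converted the same way, so a compact sketch of the loop shape may help. The helper below is hypothetical (it is not part of the patch or of OpenCV); only the API it uses (CV_SIMD, v_uint8, vx_setall_u8, vx_load, v_store, v_uint8::nlanes) is the wide universal-intrinsic interface the patch switches to, as it exists in OpenCV 3.4/4.x (later releases replace the overloaded operators with named functions such as v_lt and v_and).

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: THRESH_BINARY over one row of 8-bit pixels,
// written in the same style as the new thresh_8u above.
static void thresholdRowBinary8u(const uchar* src, uchar* dst, int width,
                                 uchar thresh, uchar maxval)
{
    int j = 0;
#if CV_SIMD
    cv::v_uint8 thresh_u = cv::vx_setall_u8(thresh);  // broadcast to all lanes
    cv::v_uint8 maxval_u = cv::vx_setall_u8(maxval);
    // v_uint8::nlanes is 16 with SSE2/NEON, 32 with AVX2, 64 with AVX-512,
    // so this one loop vectorizes to whatever width the build enables.
    for( ; j <= width - cv::v_uint8::nlanes; j += cv::v_uint8::nlanes )
    {
        cv::v_uint8 v0 = cv::vx_load(src + j);
        v0 = (thresh_u < v0) & maxval_u;  // per-lane mask selects maxval or 0
        cv::v_store(dst + j, v0);
    }
#endif
    for( ; j < width; j++ )              // scalar tail: width % nlanes pixels
        dst[j] = (uchar)(src[j] > thresh ? maxval : 0);
}

Because nlanes is resolved per backend at compile time, the run-time checkHardwareSupport(CV_CPU_SSE2/CV_CPU_NEON) guard of the old CV_SIMD128 code becomes unnecessary, which is why the patch deletes it along with the `if (useSIMD) ... else` fallback.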
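Two details of the patch that are easy to miss. First, thresh_8u's THRESH_TRUNC kernel computes min(v0, thresh) as `v0 = v0 - ( v0 - thresh_u );`: unsigned-lane subtraction in the universal intrinsics saturates, so `v0 - thresh_u` is zero whenever v0 <= thresh. A minimal sketch, under the same API assumptions as above (the helper name is invented):

// min(v, thresh) for unsigned 8-bit lanes without v_min: relies on the
// saturating operator- (v - thresh == 0 when v <= thresh).
static inline cv::v_uint8 truncU8(const cv::v_uint8& v, const cv::v_uint8& thresh_u)
{
    return v - (v - thresh_u);
}

Second, the 16-bit and floating-point paths unroll by two vectors and then, before falling back to scalars, drain one more full vector in the `if( j <= roi.width - nlanes )` block. This matters more at wide registers than it did at 128 bits: with AVX-512, a row can leave up to 2*nlanes - 1 = 63 shorts after the unrolled loop, and handling one extra vector keeps most of that remainder vectorized.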