|
|
|
@@ -2440,6 +2440,34 @@ addWeighted8u( const uchar* src1, size_t step1,
                 _mm_storel_epi64((__m128i*)(dst + x), u);
             }
         }
+#elif CV_NEON
+        float32x4_t g = vdupq_n_f32 (gamma);
+
+        for( ; x <= size.width - 8; x += 8 )
+        {
+            uint8x8_t in1 = vld1_u8(src1+x);
+            uint16x8_t in1_16 = vmovl_u8(in1);
+            float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
+            float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
+
+            uint8x8_t in2 = vld1_u8(src2+x);
+            uint16x8_t in2_16 = vmovl_u8(in2);
+            float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
+            float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
+
+            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
+            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
+            out_f_l = vaddq_f32(out_f_l, g);
+            out_f_h = vaddq_f32(out_f_h, g);
+
+            uint16x4_t out_16_l = vqmovn_u32(vcvtq_u32_f32(out_f_l));
+            uint16x4_t out_16_h = vqmovn_u32(vcvtq_u32_f32(out_f_h));
+
+            uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
+            uint8x8_t out = vqmovn_u16(out_16);
+
+            vst1_u8(dst+x, out);
+        }
 #endif
         #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
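
// Note on the NEON path added above: each 8-pixel block is widened u8 -> u16 ->
// two f32 quads, scaled by alpha/beta, offset by gamma, and narrowed back with
// saturating converts (vqmovn_*), so overflow clamps at 255 instead of wrapping.
// vcvtq_u32_f32 truncates toward zero, so results can differ slightly from a
// round-to-nearest scalar path. Minimal standalone sketch of the same pattern
// (hypothetical helper name, not part of the patch; needs a NEON toolchain):
#include <arm_neon.h>

static inline void weighted_block8(const unsigned char* a, const unsigned char* b,
                                   unsigned char* out,
                                   float alpha, float beta, float gamma)
{
    uint16x8_t a16 = vmovl_u8(vld1_u8(a));                           // u8 -> u16
    uint16x8_t b16 = vmovl_u8(vld1_u8(b));
    float32x4_t g  = vdupq_n_f32(gamma);

    float32x4_t al = vcvtq_f32_u32(vmovl_u16(vget_low_u16(a16)));    // u16 -> u32 -> f32
    float32x4_t ah = vcvtq_f32_u32(vmovl_u16(vget_high_u16(a16)));
    float32x4_t bl = vcvtq_f32_u32(vmovl_u16(vget_low_u16(b16)));
    float32x4_t bh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(b16)));

    float32x4_t rl = vaddq_f32(vaddq_f32(vmulq_n_f32(al, alpha), vmulq_n_f32(bl, beta)), g);
    float32x4_t rh = vaddq_f32(vaddq_f32(vmulq_n_f32(ah, alpha), vmulq_n_f32(bh, beta)), g);

    uint16x8_t r16 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rl)),     // f32 -> u32 -> sat u16
                                  vqmovn_u32(vcvtq_u32_f32(rh)));
    vst1_u8(out, vqmovn_u16(r16));                                   // sat u16 -> u8
}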
|
|
|
@@ -2650,6 +2678,14 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 
                 }
             }
+        #elif CV_NEON
+            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+            }
+
         #endif
 
             for( ; x < size.width; x++ ){
@@ -2674,6 +2710,13 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
                     _mm_storeu_si128((__m128i*)(dst + x), r00);
                 }
             }
+        #elif CV_NEON
+            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+            }
         #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
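
// The CMP_GT/CMP_LE and CMP_EQ/CMP_NE hunks above share one trick: the NEON
// compares produce 0xFF lanes where the predicate holds, and XOR-ing with an
// all-zero or all-ones mask either keeps or inverts that result, so a single
// compare covers both comparison codes -- the same identity as the scalar tail
// (uchar)(-(a OP b) ^ m) with m = 0 or 255. Sketch (hypothetical helper name,
// for illustration only):
#include <arm_neon.h>

static inline uint8x16_t cmp_gt_or_le_u8(uint8x16_t a, uint8x16_t b, bool greater)
{
    uint8x16_t gt   = vcgtq_u8(a, b);                            // 0xFF where a > b
    uint8x16_t mask = greater ? vdupq_n_u8(0) : vdupq_n_u8(255); // keep or invert
    return veorq_u8(gt, mask);                                   // inverted -> a <= b
}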
|
|
|
@@ -2759,6 +2802,22 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                     x += 8;
                 }
             }
+        #elif CV_NEON
+            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                int16x8_t in1 = vld1q_s16(src1 + x);
+                int16x8_t in2 = vld1q_s16(src2 + x);
+                uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
+
+                in1 = vld1q_s16(src1 + x + 8);
+                in2 = vld1q_s16(src2 + x + 8);
+                uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
+
+                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+            }
+
         #endif
 
             for( ; x < size.width; x++ ){
@@ -2797,6 +2856,21 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                     x += 8;
                 }
             }
+        #elif CV_NEON
+            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                int16x8_t in1 = vld1q_s16(src1 + x);
+                int16x8_t in2 = vld1q_s16(src2 + x);
+                uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
+
+                in1 = vld1q_s16(src1 + x + 8);
+                in2 = vld1q_s16(src2 + x + 8);
+                uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
+
+                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+            }
         #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
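
// For 16-bit sources the same mask trick applies, but one 16-byte destination
// row needs two q-register compares: each vcgtq_s16/vceqq_s16 result is
// narrowed from 0xFFFF/0x0000 to 0xFF/0x00 with vmovn_u16 and the two halves
// are packed with vcombine_u8. Sketch (hypothetical helper, illustration only):
#include <arm_neon.h>

static inline uint8x16_t cmp_eq16_s16(const short* a, const short* b)
{
    uint8x8_t lo = vmovn_u16(vceqq_s16(vld1q_s16(a),     vld1q_s16(b)));     // lanes 0..7
    uint8x8_t hi = vmovn_u16(vceqq_s16(vld1q_s16(a + 8), vld1q_s16(b + 8))); // lanes 8..15
    return vcombine_u8(lo, hi);                                              // 0xFF where equal
}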
|
|
|
@@ -3085,7 +3159,7 @@ namespace cv
 {
 
 template <typename T>
-struct InRange_SSE
+struct InRange_SIMD
 {
     int operator () (const T *, const T *, const T *, uchar *, int) const
     {
@@ -3096,7 +3170,7 @@ struct InRange_SSE
 #if CV_SSE2
 
 template <>
-struct InRange_SSE<uchar>
+struct InRange_SIMD<uchar>
 {
     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
                      uchar * dst, int len) const
@@ -3121,7 +3195,7 @@ struct InRange_SSE<uchar>
 };
 
 template <>
-struct InRange_SSE<schar>
+struct InRange_SIMD<schar>
 {
     int operator () (const schar * src1, const schar * src2, const schar * src3,
                      uchar * dst, int len) const
@@ -3146,7 +3220,7 @@ struct InRange_SSE<schar>
 };
 
 template <>
-struct InRange_SSE<ushort>
+struct InRange_SIMD<ushort>
 {
     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
                      uchar * dst, int len) const
@@ -3172,7 +3246,7 @@ struct InRange_SSE<ushort>
 };
 
 template <>
-struct InRange_SSE<short>
+struct InRange_SIMD<short>
 {
     int operator () (const short * src1, const short * src2, const short * src3,
                      uchar * dst, int len) const
@@ -3198,7 +3272,7 @@ struct InRange_SSE<short>
 };
 
 template <>
-struct InRange_SSE<int>
+struct InRange_SIMD<int>
 {
     int operator () (const int * src1, const int * src2, const int * src3,
                      uchar * dst, int len) const
@@ -3230,7 +3304,7 @@ struct InRange_SSE<int>
 };
 
 template <>
-struct InRange_SSE<float>
+struct InRange_SIMD<float>
 {
     int operator () (const float * src1, const float * src2, const float * src3,
                      uchar * dst, int len) const
@@ -3261,6 +3335,160 @@ struct InRange_SSE<float>
     }
 };
 
+#elif CV_NEON
+
+template <>
+struct InRange_SIMD<uchar>
+{
+    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            uint8x16_t values = vld1q_u8(src1 + x);
+            uint8x16_t low = vld1q_u8(src2 + x);
+            uint8x16_t high = vld1q_u8(src3 + x);
+
+            vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<schar>
+{
+    int operator () (const schar * src1, const schar * src2, const schar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            int8x16_t values = vld1q_s8(src1 + x);
+            int8x16_t low = vld1q_s8(src2 + x);
+            int8x16_t high = vld1q_s8(src3 + x);
+
+            vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<ushort>
+{
+    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
+            uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
+            uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
+            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
+
+            values = vld1q_u16((const uint16_t*)(src1 + x + 8));
+            low = vld1q_u16((const uint16_t*)(src2 + x + 8));
+            high = vld1q_u16((const uint16_t*)(src3 + x + 8));
+            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
+
+            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<short>
+{
+    int operator () (const short * src1, const short * src2, const short * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
+            int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
+            int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
+            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
+
+            values = vld1q_s16((const int16_t*)(src1 + x + 8));
+            low = vld1q_s16((const int16_t*)(src2 + x + 8));
+            high = vld1q_s16((const int16_t*)(src3 + x + 8));
+            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
+
+            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<int>
+{
+    int operator () (const int * src1, const int * src2, const int * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 8; x += 8 )
+        {
+            int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
+            int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
+            int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));
+
+            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
+
+            values = vld1q_s32((const int32_t*)(src1 + x + 4));
+            low = vld1q_s32((const int32_t*)(src2 + x + 4));
+            high = vld1q_s32((const int32_t*)(src3 + x + 4));
+
+            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
+
+            uint16x8_t res_16 = vcombine_u16(r1, r2);
+
+            vst1_u8(dst + x, vmovn_u16(res_16));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<float>
+{
+    int operator () (const float * src1, const float * src2, const float * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 8; x += 8 )
+        {
+            float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
+            float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
+            float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));
+
+            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
+
+            values = vld1q_f32((const float32_t*)(src1 + x + 4));
+            low = vld1q_f32((const float32_t*)(src2 + x + 4));
+            high = vld1q_f32((const float32_t*)(src3 + x + 4));
+
+            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
+
+            uint16x8_t res_16 = vcombine_u16(r1, r2);
+
+            vst1_u8(dst + x, vmovn_u16(res_16));
+        }
+        return x;
+    }
+};
+
 #endif
 
 template <typename T>
@@ -3272,7 +3500,7 @@ static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     step2 /= sizeof(src2[0]);
     step3 /= sizeof(src3[0]);
 
-    InRange_SSE<T> vop;
+    InRange_SIMD<T> vop;
 
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
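
// The InRange_SSE -> InRange_SIMD rename above makes the functor name
// backend-neutral: the generic template returns 0 (nothing vectorized), each
// SSE2 or NEON specialization returns how many elements it handled, and
// inRange_ finishes the row with its scalar loop from that index. The NEON
// specializations compute low <= v && v <= high as two vcgeq compares ANDed
// together, narrowing the lane masks to uchar for the wider types. Minimal
// sketch of that dispatch idiom (illustrative names, not the OpenCV API):
template <typename T, typename VOp>
static void inrange_row(const T* src, const T* lo, const T* hi,
                        unsigned char* dst, int len, VOp vop)
{
    int x = vop(src, lo, hi, dst, len);   // SIMD functor handles a prefix (0 if none)
    for( ; x < len; x++ )                 // scalar tail: 255 inside the range, 0 outside
        dst[x] = (unsigned char)-(lo[x] <= src[x] && src[x] <= hi[x]);
}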
|
|
|
|