|
|
|
@@ -178,9 +178,161 @@ struct PyrDownVec_32f
     }
 };
 
+typedef NoVec<int, ushort> PyrDownVec_32s16u;
+typedef NoVec<int, short> PyrDownVec_32s16s;
+
+#elif CV_NEON
+
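+// Vertical pass of the 5-tap 1-4-6-4-1 pyramid kernel:
+// dst[x] = (r0 + 4*r1 + 6*r2 + 4*r3 + r4 + 128) >> 8, using saturating
+// 16-bit arithmetic on 16 output pixels per iteration.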
+struct PyrDownVec_32s8u
+{
+    int operator()(int** src, uchar* dst, int, int width) const
+    {
+        int x = 0;
+        const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
+                           *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
+                           *row4 = (unsigned int*)src[4];
+        uint16x8_t v_delta = vdupq_n_u16(128);
+
+        for( ; x <= width - 16; x += 16 )
+        {
+            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
+            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
+            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
+            uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
+            uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));
+
+            v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
+            v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
+            uint16x8_t v_dst0 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
+
+            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
+            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
+            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
+            v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
+            v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));
+
+            v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2));
+            v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3);
+            uint16x8_t v_dst1 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
+
+            vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
+                                          vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
+        }
+
+        return x;
+    }
+};
+
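+// Same vertical pass kept in 32-bit lanes, eight pixels per iteration;
+// results are narrowed to ushort with unsigned saturation (vqmovun_s32).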
+struct PyrDownVec_32s16u
+{
+    int operator()(int** src, ushort* dst, int, int width) const
+    {
+        int x = 0;
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+        int32x4_t v_delta = vdupq_n_s32(128);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
+            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
+            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
+            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
+            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
+
+            v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
+            v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
+            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8);
+
+            v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
+            v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
+            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8);
+
+            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
+        }
+
+        return x;
+    }
+};
+
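+// As above, but narrowed to short with signed saturation (vqmovn_s32).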
+struct PyrDownVec_32s16s
+{
+    int operator()(int** src, short* dst, int, int width) const
+    {
+        int x = 0;
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+        int32x4_t v_delta = vdupq_n_s32(128);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
+            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
+            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
+            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
+            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
+
+            v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20));
+            v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30);
+            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8);
+
+            v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21));
+            v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31);
+            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8);
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
+        }
+
+        return x;
+    }
+};
+
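+// Float variant: accumulates r0 + r4 + 2*r2 and 4*(r1 + r2 + r3), then
+// scales by 1/256 to normalize the separable 5x5 kernel.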
+struct PyrDownVec_32f
+{
+    int operator()(float** src, float* dst, int, int width) const
+    {
+        int x = 0;
+        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+        float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_r0 = vld1q_f32(row0 + x);
+            float32x4_t v_r1 = vld1q_f32(row1 + x);
+            float32x4_t v_r2 = vld1q_f32(row2 + x);
+            float32x4_t v_r3 = vld1q_f32(row3 + x);
+            float32x4_t v_r4 = vld1q_f32(row4 + x);
+
+            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
+            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
+            vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
+
+            v_r0 = vld1q_f32(row0 + x + 4);
+            v_r1 = vld1q_f32(row1 + x + 4);
+            v_r2 = vld1q_f32(row2 + x + 4);
+            v_r3 = vld1q_f32(row3 + x + 4);
+            v_r4 = vld1q_f32(row4 + x + 4);
+
+            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
+            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
+            vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
+        }
+
+        return x;
+    }
+};
+
 #else
 
 typedef NoVec<int, uchar> PyrDownVec_32s8u;
+typedef NoVec<int, ushort> PyrDownVec_32s16u;
+typedef NoVec<int, short> PyrDownVec_32s16s;
 typedef NoVec<float, float> PyrDownVec_32f;
 
 #endif
@@ -561,9 +713,9 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
     if( depth == CV_8U )
         func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
     else if( depth == CV_16S )
-        func = pyrDown_<FixPtCast<short, 8>, NoVec<int, short> >;
+        func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
     else if( depth == CV_16U )
-        func = pyrDown_<FixPtCast<ushort, 8>, NoVec<int, ushort> >;
+        func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
     else if( depth == CV_32F )
         func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
     else if( depth == CV_64F )