|
|
|
@@ -133,7 +133,7 @@ static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];
|
|
|
|
static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; |
|
|
|
|
static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; |
|
|
|
|
|
|
|
|
|
#if CV_SSE2 || CV_NEON |
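// BilinearTab_iC4 is a 16-byte aligned view of the bilinear coefficient
// table, repacked as rows of 8 shorts so the weights can be loaded
// directly into 128-bit registers by the SSE2 and NEON remap paths.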
|
|
|
|
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; |
|
|
|
|
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); |
|
|
|
|
#endif |
|
|
|
@@ -269,7 +269,7 @@ static const void* initInterTab2D( int method, bool fixpt )
|
|
|
|
} |
|
|
|
|
tab -= INTER_TAB_SIZE2*ksize*ksize; |
|
|
|
|
itab -= INTER_TAB_SIZE2*ksize*ksize; |
|
|
|
|
#if CV_SSE2 || CV_NEON |
|
|
|
|
if( method == INTER_LINEAR ) |
|
|
|
|
{ |
|
|
|
|
for( i = 0; i < INTER_TAB_SIZE2; i++ ) |
|
|
|
@@ -894,9 +894,51 @@ struct VResizeCubicVec_32f
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f16u; |
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f16s; |
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f; |
|
|
|
|
|
|
|
|
|
#elif CV_NEON |
|
|
|
|
|
|
|
|
|
struct VResizeLinearVec_32s8u |
|
|
|
|
{ |
|
|
|
|
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const |
|
|
|
|
{ |
|
|
|
|
const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; |
|
|
|
|
const short* beta = (const short*)_beta; |
|
|
|
|
int x = 0; |
|
|
|
|
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); |
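// Fixed-point vertical blend of two accumulator rows. The 32-bit sums are
// pre-shifted right by 4 to fit int16; vqdmulhq_s16 returns (a*b*2) >> 16,
// so the extra >> 1 yields (a*b) >> 16, and (v + 2) >> 2 is the final
// rounded shift down to 8-bit range before saturating with vqmovun_s16.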
|
|
|
|
|
|
|
|
|
for( ; x <= width - 16; x += 16) |
|
|
|
|
{ |
|
|
|
|
int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); |
|
|
|
|
int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); |
|
|
|
|
|
|
|
|
|
int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); |
|
|
|
|
int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); |
|
|
|
|
|
|
|
|
|
int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), |
|
|
|
|
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); |
|
|
|
|
v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); |
|
|
|
|
|
|
|
|
|
v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); |
|
|
|
|
v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); |
|
|
|
|
v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); |
|
|
|
|
v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); |
|
|
|
|
|
|
|
|
|
v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); |
|
|
|
|
v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); |
|
|
|
|
|
|
|
|
|
int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), |
|
|
|
|
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); |
|
|
|
|
v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); |
|
|
|
|
|
|
|
|
|
vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return x; |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
struct VResizeLinearVec_32f16u |
|
|
|
|
{ |
|
|
|
@@ -1071,6 +1113,128 @@ struct VResizeCubicVec_32f
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
struct VResizeLanczos4Vec_32f16u |
|
|
|
|
{ |
|
|
|
|
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const |
|
|
|
|
{ |
|
|
|
|
const float** src = (const float**)_src; |
|
|
|
|
const float* beta = (const float*)_beta; |
|
|
|
|
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], |
|
|
|
|
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; |
|
|
|
|
ushort * dst = (ushort*)_dst; |
|
|
|
|
int x = 0; |
|
|
|
|
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), |
|
|
|
|
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), |
|
|
|
|
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), |
|
|
|
|
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); |
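// Vertical 8-tap Lanczos pass: every output element is a weighted sum of
// eight source rows. The taps are split into two multiply-accumulate
// chains of four (v_dst0 and v_dst1) that are added at the end.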
|
|
|
|
|
|
|
|
|
for( ; x <= width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), |
|
|
|
|
v_b1, vld1q_f32(S1 + x)), |
|
|
|
|
v_b2, vld1q_f32(S2 + x)), |
|
|
|
|
v_b3, vld1q_f32(S3 + x)); |
|
|
|
|
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), |
|
|
|
|
v_b5, vld1q_f32(S5 + x)), |
|
|
|
|
v_b6, vld1q_f32(S6 + x)), |
|
|
|
|
v_b7, vld1q_f32(S7 + x)); |
|
|
|
|
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); |
|
|
|
|
|
|
|
|
|
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), |
|
|
|
|
v_b1, vld1q_f32(S1 + x + 4)), |
|
|
|
|
v_b2, vld1q_f32(S2 + x + 4)), |
|
|
|
|
v_b3, vld1q_f32(S3 + x + 4)); |
|
|
|
|
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), |
|
|
|
|
v_b5, vld1q_f32(S5 + x + 4)), |
|
|
|
|
v_b6, vld1q_f32(S6 + x + 4)), |
|
|
|
|
v_b7, vld1q_f32(S7 + x + 4)); |
|
|
|
|
v_dst1 = vaddq_f32(v_dst0, v_dst1); |
|
|
|
|
|
|
|
|
|
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), |
|
|
|
|
vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return x; |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
struct VResizeLanczos4Vec_32f16s |
|
|
|
|
{ |
|
|
|
|
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const |
|
|
|
|
{ |
|
|
|
|
const float** src = (const float**)_src; |
|
|
|
|
const float* beta = (const float*)_beta; |
|
|
|
|
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], |
|
|
|
|
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; |
|
|
|
|
short * dst = (short*)_dst; |
|
|
|
|
int x = 0; |
|
|
|
|
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), |
|
|
|
|
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), |
|
|
|
|
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), |
|
|
|
|
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); |
|
|
|
|
|
|
|
|
|
for( ; x <= width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), |
|
|
|
|
v_b1, vld1q_f32(S1 + x)), |
|
|
|
|
v_b2, vld1q_f32(S2 + x)), |
|
|
|
|
v_b3, vld1q_f32(S3 + x)); |
|
|
|
|
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), |
|
|
|
|
v_b5, vld1q_f32(S5 + x)), |
|
|
|
|
v_b6, vld1q_f32(S6 + x)), |
|
|
|
|
v_b7, vld1q_f32(S7 + x)); |
|
|
|
|
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); |
|
|
|
|
|
|
|
|
|
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), |
|
|
|
|
v_b1, vld1q_f32(S1 + x + 4)), |
|
|
|
|
v_b2, vld1q_f32(S2 + x + 4)), |
|
|
|
|
v_b3, vld1q_f32(S3 + x + 4)); |
|
|
|
|
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), |
|
|
|
|
v_b5, vld1q_f32(S5 + x + 4)), |
|
|
|
|
v_b6, vld1q_f32(S6 + x + 4)), |
|
|
|
|
v_b7, vld1q_f32(S7 + x + 4)); |
|
|
|
|
v_dst1 = vaddq_f32(v_dst0, v_dst1); |
|
|
|
|
|
|
|
|
|
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), |
|
|
|
|
vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return x; |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
struct VResizeLanczos4Vec_32f |
|
|
|
|
{ |
|
|
|
|
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const |
|
|
|
|
{ |
|
|
|
|
const float** src = (const float**)_src; |
|
|
|
|
const float* beta = (const float*)_beta; |
|
|
|
|
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], |
|
|
|
|
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; |
|
|
|
|
float* dst = (float*)_dst; |
|
|
|
|
int x = 0; |
|
|
|
|
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), |
|
|
|
|
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), |
|
|
|
|
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), |
|
|
|
|
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); |
|
|
|
|
|
|
|
|
|
for( ; x <= width - 4; x += 4 ) |
|
|
|
|
{ |
|
|
|
|
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), |
|
|
|
|
v_b1, vld1q_f32(S1 + x)), |
|
|
|
|
v_b2, vld1q_f32(S2 + x)), |
|
|
|
|
v_b3, vld1q_f32(S3 + x)); |
|
|
|
|
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), |
|
|
|
|
v_b5, vld1q_f32(S5 + x)), |
|
|
|
|
v_b6, vld1q_f32(S6 + x)), |
|
|
|
|
v_b7, vld1q_f32(S7 + x)); |
|
|
|
|
vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return x; |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
|
|
typedef VResizeNoVec VResizeLinearVec_32s8u; |
|
|
|
@@ -1083,6 +1247,10 @@ typedef VResizeNoVec VResizeCubicVec_32f16u;
|
|
|
|
typedef VResizeNoVec VResizeCubicVec_32f16s; |
|
|
|
|
typedef VResizeNoVec VResizeCubicVec_32f; |
|
|
|
|
|
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f16u; |
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f16s; |
|
|
|
|
typedef VResizeNoVec VResizeLanczos4Vec_32f; |
|
|
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
typedef HResizeNoVec HResizeLinearVec_8u32s; |
|
|
|
@@ -1611,6 +1779,107 @@ private:
|
|
|
|
int cn, step; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
class ResizeAreaFastVec_SIMD_16s |
|
|
|
|
{ |
|
|
|
|
public: |
|
|
|
|
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : |
|
|
|
|
cn(_cn), step(_step) |
|
|
|
|
{ |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int operator() (const short * S, short * D, int w) const |
|
|
|
|
{ |
|
|
|
|
int dx = 0; |
|
|
|
|
const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); |
|
|
|
|
|
|
|
|
|
int32x4_t v_2 = vdupq_n_s32(2); |
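// 2x2 box filter: for cn == 1, vld2q_s16 de-interleaves even and odd
// columns, so val[0] + val[1] sums horizontal neighbours; adding the row
// below and computing (sum + 2) >> 2 gives the rounded average.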
|
|
|
|
|
|
|
|
|
if (cn == 1) |
|
|
|
|
{ |
|
|
|
|
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) |
|
|
|
|
{ |
|
|
|
|
int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); |
|
|
|
|
|
|
|
|
|
int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); |
|
|
|
|
v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); |
|
|
|
|
v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); |
|
|
|
|
|
|
|
|
|
int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); |
|
|
|
|
v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); |
|
|
|
|
v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); |
|
|
|
|
|
|
|
|
|
vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else if (cn == 4) |
|
|
|
|
{ |
|
|
|
|
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) |
|
|
|
|
{ |
|
|
|
|
int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); |
|
|
|
|
int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), |
|
|
|
|
vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); |
|
|
|
|
vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return dx; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
private: |
|
|
|
|
int cn, step; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
struct ResizeAreaFastVec_SIMD_32f |
|
|
|
|
{ |
|
|
|
|
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : |
|
|
|
|
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) |
|
|
|
|
{ |
|
|
|
|
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int operator() (const float * S, float * D, int w) const |
|
|
|
|
{ |
|
|
|
|
if (!fast_mode) |
|
|
|
|
return 0; |
|
|
|
|
|
|
|
|
|
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); |
|
|
|
|
int dx = 0; |
|
|
|
|
|
|
|
|
|
float32x4_t v_025 = vdupq_n_f32(0.25f); |
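// 2x2 mean for floats: sum the four samples of each block and scale by 0.25.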
|
|
|
|
|
|
|
|
|
if (cn == 1) |
|
|
|
|
{ |
|
|
|
|
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) |
|
|
|
|
{ |
|
|
|
|
float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); |
|
|
|
|
|
|
|
|
|
float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); |
|
|
|
|
float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); |
|
|
|
|
|
|
|
|
|
vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else if (cn == 4) |
|
|
|
|
{ |
|
|
|
|
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) |
|
|
|
|
{ |
|
|
|
|
float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); |
|
|
|
|
float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); |
|
|
|
|
|
|
|
|
|
vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return dx; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
private: |
|
|
|
|
int scale_x, scale_y; |
|
|
|
|
int cn; |
|
|
|
|
bool fast_mode; |
|
|
|
|
int step; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
#elif CV_SSE2 |
|
|
|
|
|
|
|
|
|
class ResizeAreaFastVec_SIMD_8u |
|
|
|
@@ -1800,9 +2069,16 @@ private:
|
|
|
|
bool use_simd; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s; |
|
|
|
|
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f; |
|
|
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
|
|
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u; |
|
|
|
|
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u; |
|
|
|
|
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s; |
|
|
|
|
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f; |
|
|
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
template<typename T, typename SIMDVecOp> |
|
|
|
@@ -2626,14 +2902,14 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
|
|
|
0, |
|
|
|
|
resizeGeneric_<HResizeLanczos4<ushort, float, float>, |
|
|
|
|
VResizeLanczos4<ushort, float, float, Cast<float, ushort>, |
|
|
|
|
VResizeLanczos4Vec_32f16u> >, |
|
|
|
|
resizeGeneric_<HResizeLanczos4<short, float, float>, |
|
|
|
|
VResizeLanczos4<short, float, float, Cast<float, short>, |
|
|
|
|
VResizeLanczos4Vec_32f16s> >, |
|
|
|
|
0, |
|
|
|
|
resizeGeneric_<HResizeLanczos4<float, float, float>, |
|
|
|
|
VResizeLanczos4<float, float, float, Cast<float, float>, |
|
|
|
|
VResizeLanczos4Vec_32f> >, |
|
|
|
|
resizeGeneric_<HResizeLanczos4<double, double, float>, |
|
|
|
|
VResizeLanczos4<double, double, float, Cast<double, double>, |
|
|
|
|
VResizeNoVec> >, |
|
|
|
@@ -2645,9 +2921,9 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
|
|
|
resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >, |
|
|
|
|
0, |
|
|
|
|
resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >, |
|
|
|
|
resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >, |
|
|
|
|
0, |
|
|
|
|
resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>, |
|
|
|
|
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >, |
|
|
|
|
0 |
|
|
|
|
}; |
|
|
|
@@ -4281,17 +4557,59 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
|
|
|
float* dst2f = dstmap2.ptr<float>(y); |
|
|
|
|
short* dst1 = (short*)dst1f; |
|
|
|
|
ushort* dst2 = (ushort*)dst2f; |
|
|
|
|
x = 0; |
|
|
|
|
|
|
|
|
|
if( m1type == CV_32FC1 && dstm1type == CV_16SC2 ) |
|
|
|
|
{ |
|
|
|
|
if( nninterpolate ) |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
for( ; x <= size.width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
int16x8x2_t v_dst; |
|
|
|
|
v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), |
|
|
|
|
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))); |
|
|
|
|
v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))), |
|
|
|
|
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4)))); |
|
|
|
|
|
|
|
|
|
vst2q_s16(dst1 + (x << 1), v_dst); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
dst1[x*2] = saturate_cast<short>(src1f[x]); |
|
|
|
|
dst1[x*2+1] = saturate_cast<short>(src2f[x]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); |
|
|
|
|
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); |
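// Each coordinate is scaled by INTER_TAB_SIZE; the high bits (>> INTER_BITS)
// become the integer position stored in dst1, while the low INTER_BITS of y
// and x are packed as (y_frac << INTER_BITS) + x_frac into the dst2 index.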
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale)); |
|
|
|
|
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale)); |
|
|
|
|
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale)); |
|
|
|
|
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale)); |
|
|
|
|
|
|
|
|
|
int16x8x2_t v_dst; |
|
|
|
|
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); |
|
|
|
|
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); |
|
|
|
|
|
|
|
|
|
vst2q_s16(dst1 + (x << 1), v_dst); |
|
|
|
|
|
|
|
|
|
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_ix0, v_mask))); |
|
|
|
|
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_ix1, v_mask))); |
|
|
|
|
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE); |
|
|
|
|
int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE); |
|
|
|
@@ -4299,17 +4617,53 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
|
|
|
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); |
|
|
|
|
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 ) |
|
|
|
|
{ |
|
|
|
|
if( nninterpolate ) |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
for( ; x <= (size.width << 1) - 8; x += 8 ) |
|
|
|
|
vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), |
|
|
|
|
vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); |
|
|
|
|
#endif |
|
|
|
|
for( ; x < (size.width << 1); x++ )
{
    // x indexes individual floats of the flattened 2-channel row here,
    // staying consistent with the flat indexing of the NEON loop above
    dst1[x] = saturate_cast<short>(src1f[x]);
}
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); |
|
|
|
|
int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); |
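// Same packing as the CV_32FC1 case above, except vld2q_f32 first
// de-interleaves the (x, y) pairs of the 2-channel float map.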
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 8; x += 8 ) |
|
|
|
|
{ |
|
|
|
|
float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8); |
|
|
|
|
int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale)); |
|
|
|
|
int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale)); |
|
|
|
|
int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale)); |
|
|
|
|
int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale)); |
|
|
|
|
|
|
|
|
|
int16x8x2_t v_dst; |
|
|
|
|
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); |
|
|
|
|
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); |
|
|
|
|
|
|
|
|
|
vst2q_s16(dst1 + (x << 1), v_dst); |
|
|
|
|
|
|
|
|
|
uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_ix0, v_mask))); |
|
|
|
|
uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_ix1, v_mask))); |
|
|
|
|
vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE); |
|
|
|
|
int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE); |
|
|
|
@@ -4317,10 +4671,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
|
|
|
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); |
|
|
|
|
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 ) |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1); |
|
|
|
|
uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1); |
|
|
|
|
float32x4_t v_scale = vdupq_n_f32(scale); |
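// Inverse conversion: rebuild the float maps from the integer map plus the
// packed fractional index in src2; scale (1.f/INTER_TAB_SIZE, computed
// earlier in convertMaps) turns the extracted fraction bits back into
// sub-pixel offsets.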
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 8; x += 8) |
|
|
|
|
{ |
|
|
|
|
uint32x4_t v_fxy1, v_fxy2; |
|
|
|
|
if (src2) |
|
|
|
|
{ |
|
|
|
|
uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2); |
|
|
|
|
v_fxy1 = vmovl_u16(vget_low_u16(v_src2)); |
|
|
|
|
v_fxy2 = vmovl_u16(vget_high_u16(v_src2)); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
v_fxy1 = v_fxy2 = v_zero; |
|
|
|
|
|
|
|
|
|
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); |
|
|
|
|
float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), |
|
|
|
|
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask))); |
|
|
|
|
float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), |
|
|
|
|
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS))); |
|
|
|
|
vst1q_f32(dst1f + x, v_dst1); |
|
|
|
|
vst1q_f32(dst2f + x, v_dst2); |
|
|
|
|
|
|
|
|
|
v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), |
|
|
|
|
v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask))); |
|
|
|
|
v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), |
|
|
|
|
v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS))); |
|
|
|
|
vst1q_f32(dst1f + x + 4, v_dst1); |
|
|
|
|
vst1q_f32(dst2f + x + 4, v_dst2); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0; |
|
|
|
|
dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; |
|
|
|
@@ -4329,7 +4717,39 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
|
|
|
} |
|
|
|
|
else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 ) |
|
|
|
|
{ |
|
|
|
|
#if CV_NEON |
|
|
|
|
int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1); |
|
|
|
|
int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1); |
|
|
|
|
float32x4_t v_scale = vdupq_n_f32(scale); |
|
|
|
|
|
|
|
|
|
for( ; x <= size.width - 8; x += 8) |
|
|
|
|
{ |
|
|
|
|
int32x4_t v_fxy1, v_fxy2; |
|
|
|
|
if (src2) |
|
|
|
|
{ |
|
|
|
|
int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2); |
|
|
|
|
v_fxy1 = vmovl_s16(vget_low_s16(v_src2)); |
|
|
|
|
v_fxy2 = vmovl_s16(vget_high_s16(v_src2)); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
v_fxy1 = v_fxy2 = v_zero; |
|
|
|
|
|
|
|
|
|
int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); |
|
|
|
|
float32x4x2_t v_dst; |
|
|
|
|
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), |
|
|
|
|
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask))); |
|
|
|
|
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), |
|
|
|
|
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS))); |
|
|
|
|
vst2q_f32(dst1f + (x << 1), v_dst); |
|
|
|
|
|
|
|
|
|
v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), |
|
|
|
|
v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask))); |
|
|
|
|
v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), |
|
|
|
|
v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); |
|
|
|
|
vst2q_f32(dst1f + (x << 1) + 8, v_dst); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x < size.width; x++ ) |
|
|
|
|
{ |
|
|
|
|
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0; |
|
|
|
|
dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; |
|
|
|
@@ -4389,13 +4809,29 @@ public:
|
|
|
|
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta; |
|
|
|
|
|
|
|
|
|
if( interpolation == INTER_NEAREST ) |
|
|
|
|
{ |
|
|
|
|
x1 = 0; |
|
|
|
|
#if CV_NEON |
|
|
|
|
int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0); |
|
|
|
|
for( ; x1 <= bw - 8; x1 += 8 ) |
|
|
|
|
{ |
|
|
|
|
int16x8x2_t v_dst; |
|
|
|
|
v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS))); |
|
|
|
|
v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)), |
|
|
|
|
vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS))); |
|
|
|
|
|
|
|
|
|
vst2q_s16(xy + (x1 << 1), v_dst); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x1 < bw; x1++ ) |
|
|
|
|
{ |
|
|
|
|
int X = (X0 + adelta[x+x1]) >> AB_BITS; |
|
|
|
|
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; |
|
|
|
|
xy[x1*2] = saturate_cast<short>(X); |
|
|
|
|
xy[x1*2+1] = saturate_cast<short>(Y); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
short* alpha = A + y1*bw; |
|
|
|
@@ -4433,6 +4869,27 @@ public:
|
|
|
|
_mm_storeu_si128((__m128i*)(alpha + x1), fx_); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#elif CV_NEON |
|
|
|
|
int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); |
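// adelta/bdelta hold per-column affine increments scaled by AB_SCALE.
// Shifting by AB_BITS - INTER_BITS keeps INTER_BITS of sub-pixel fraction:
// the high bits become the integer coordinates stored in xy, the low bits
// are packed into the interpolation-table index stored in alpha.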
|
|
|
|
for( ; x1 <= bw - 8; x1 += 8 ) |
|
|
|
|
{ |
|
|
|
|
int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS); |
|
|
|
|
int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS); |
|
|
|
|
int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS); |
|
|
|
|
int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS); |
|
|
|
|
|
|
|
|
|
int16x8x2_t v_xy; |
|
|
|
|
v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS))); |
|
|
|
|
v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS))); |
|
|
|
|
|
|
|
|
|
vst2q_s16(xy + (x1 << 1), v_xy); |
|
|
|
|
|
|
|
|
|
int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_X0, v_mask))); |
|
|
|
|
int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS), |
|
|
|
|
vandq_s32(v_X1, v_mask))); |
|
|
|
|
vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1)); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; x1 < bw; x1++ ) |
|
|
|
|
{ |
|
|
|
|