@@ -87,13 +87,13 @@ template<typename T1, typename T2> int PyrUpVecV(T1**, T2**, int) { return 0; }
 template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const uchar *src01 = src, *src23 = src + 2, *src4 = src + 3;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src0)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src2)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4) +
                      (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16));
     vx_cleanup();

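Note on the packed tap constants used above (an explanatory sketch, not text from the patch; it assumes a little-endian lane layout): each 32-bit lane of 0x00040001 reinterpreted as two int16 lanes is {1, 4}, and 0x00040006 gives {6, 4}, so v_dotprod against consecutive samples produces p0 + 4*p1 and 6*p2 + 4*p3 per output lane.

    // Hypothetical standalone check of that lane layout (little-endian assumption).
    #include <cstdint>
    #include <cstring>
    int main()
    {
        uint32_t packed = 0x00040001;
        int16_t taps[2];
        std::memcpy(taps, &packed, sizeof taps);   // taps[0] == 1, taps[1] == 4 on little-endian
        return (taps[0] == 1 && taps[1] == 4) ? 0 : 1;
    }
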
@@ -102,13 +102,13 @@ template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<uchar, int, 2>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const uchar *src01 = src, *src23 = src + 4, *src4 = src + 6;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16));
     vx_cleanup();

@@ -150,13 +150,13 @@ template<> int PyrDownVecH<uchar, int, 3>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
 {
     int x = 0;
-    const uchar *src0 = src, *src2 = src + 8, *src4 = src + 12;
+    const uchar *src01 = src, *src23 = src + 8, *src4 = src + 12;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
-                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
+                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16));
     vx_cleanup();

@@ -166,13 +166,13 @@ template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
 template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
 {
     int x = 0;
-    const short *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const short *src01 = src, *src23 = src + 2, *src4 = src + 3;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(vx_load(src0), v_1_4) +
-                     v_dotprod(vx_load(src2), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(vx_load(src01), v_1_4) +
+                     v_dotprod(vx_load(src23), v_6_4) +
                      (v_reinterpret_as_s32(vx_load(src4)) >> 16));
     vx_cleanup();

@@ -181,13 +181,13 @@ template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
 template<> int PyrDownVecH<short, int, 2>(const short* src, int* row, int width)
 {
     int x = 0;
-    const short *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const short *src01 = src, *src23 = src + 4, *src4 = src + 6;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src0)), v_1_4) +
-                     v_dotprod(v_interleave_pairs(vx_load(src2)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4) +
+                     v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4) +
                      (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16));
     vx_cleanup();

@@ -247,15 +247,15 @@ template<> int PyrDownVecH<short, int, 4>(const short* src, int* row, int width)
 template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int width)
 {
     int x = 0;
-    const ushort *src0 = src, *src2 = src + 2, *src4 = src + 3;
+    const ushort *src01 = src, *src23 = src + 2, *src4 = src + 3;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half)), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4) +
                      v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15);
     vx_cleanup();

@@ -264,15 +264,15 @@ template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int widt
 template<> int PyrDownVecH<ushort, int, 2>(const ushort* src, int* row, int width)
 {
     int x = 0;
-    const ushort *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const ushort *src01 = src, *src23 = src + 4, *src4 = src + 6;

     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half))), v_6_4) +
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4) +
                      v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15);
     vx_cleanup();

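Note on the ushort paths above (reasoning about the constants already shown, not text from the patch): v_dotprod works on signed 16-bit lanes, so the inputs are biased by subtracting 0x8000 (v_half, with wrap-around). Each output accumulates taps of total weight 1 + 4 + 6 + 4 = 15 over biased samples, so the bias to restore is 15 * 0x8000 = 0x00078000, which is exactly what v_half15 adds back; the fifth tap (weight 1) is taken unbiased from src4.

    // Hypothetical compile-time check of that arithmetic (not part of the patch).
    static_assert(15 * 0x8000 == 0x00078000, "v_half15 restores the dot-product bias");
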
@@ -344,15 +344,15 @@ template<> int PyrDownVecH<ushort, int, 4>(const ushort* src, int* row, int widt
 template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int width)
 {
     int x = 0;
-    const float *src0 = src, *src2 = src + 2, *src4 = src + 4;
+    const float *src01 = src, *src23 = src + 2, *src4 = src + 3;

     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src0 += 2*v_float32::nlanes, src2 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
+    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src01 += 2*v_float32::nlanes, src23 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
     {
         v_float32 r0, r1, r2, r3, r4, rtmp;
-        v_load_deinterleave(src0, r0, r1);
-        v_load_deinterleave(src2, r2, r3);
-        v_load_deinterleave(src4, r4, rtmp);
+        v_load_deinterleave(src01, r0, r1);
+        v_load_deinterleave(src23, r2, r3);
+        v_load_deinterleave(src4, rtmp, r4);
         v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
     }
     vx_cleanup();
@@ -362,14 +362,14 @@ template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int wi
 template<> int PyrDownVecH<float, float, 2>(const float* src, float* row, int width)
 {
     int x = 0;
-    const float *src0 = src, *src2 = src + 4, *src4 = src + 6;
+    const float *src01 = src, *src23 = src + 4, *src4 = src + 6;

     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src0 += 4*v_float32::nlanes, src2 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
+    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src01 += 4*v_float32::nlanes, src23 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
     {
         v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb;
-        v_load_deinterleave(src0, r0a, r0b, r1a, r1b);
-        v_load_deinterleave(src2, r2a, r2b, r3a, r3b);
+        v_load_deinterleave(src01, r0a, r0b, r1a, r1b);
+        v_load_deinterleave(src23, r2a, r2b, r3a, r3b);
         v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b);
         v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b)));
     }
@@ -430,15 +430,15 @@ template<> int PyrDownVecH<float, float, 4>(const float* src, float* row, int wi
 template<> int PyrDownVecH<double, double, 1>(const double* src, double* row, int width)
 {
     int x = 0;
-    const double *src0 = src, *src2 = src + 2, *src4 = src + 4;
+    const double *src01 = src, *src23 = src + 2, *src4 = src + 3;

     v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f);
-    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src0 += 2*v_float64::nlanes, src2 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
+    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src01 += 2*v_float64::nlanes, src23 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
     {
         v_float64 r0, r1, r2, r3, r4, rtmp;
-        v_load_deinterleave(src0, r0, r1);
-        v_load_deinterleave(src2, r2, r3);
-        v_load_deinterleave(src4, r4, rtmp);
+        v_load_deinterleave(src01, r0, r1);
+        v_load_deinterleave(src23, r2, r3);
+        v_load_deinterleave(src4, rtmp, r4);
         v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
     }
     vx_cleanup();
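For reference, the single-channel horizontal pass these specializations vectorize corresponds to the following scalar sketch (illustrative only; the function name and the return-value handling are assumptions, not code from the patch): each output applies the 1-4-6-4-1 binomial taps to five consecutive inputs, advancing the source by two samples per output.

    // Scalar sketch of PyrDownVecH for cn == 1 (assumed semantics).
    static int pyrDownRowH_scalar(const uchar* src, int* row, int width)
    {
        for (int x = 0; x < width; x++, src += 2)
            row[x] = src[0] + 4*src[1] + 6*src[2] + 4*src[3] + src[4];
        return width;  // the SIMD versions instead return the count of vectorized outputs
    }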