@@ -1674,93 +1674,9 @@ struct HResizeLinearVecU8_X4
                 }
             }
         }
-        else if(cn < 9)
-        {
-            const int step = 8;
-            const int len0 = xmax & -step;
-            for( ; k <= (count - 2); k+=2 )
-            {
-                const uchar *S0 = src[k];
-                int *D0 = dst[k];
-                const uchar *S1 = src[k+1];
-                int *D1 = dst[k+1];
-
-                for( dx = 0; dx < len0; dx += cn )
-                {
-                    v_int16x8 a0 = v_load(alpha+dx*2);
-                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
-                    v_uint16x8 s0, s1;
-                    v_zip(v_load_expand(S0+xofs[dx]), v_load_expand(S0+xofs[dx]+cn), s0, s1);
-                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
-                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
-                    v_zip(v_load_expand(S1+xofs[dx]), v_load_expand(S1+xofs[dx]+cn), s0, s1);
-                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
-                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
-                }
-            }
-            for( ; k < count; k++ )
-            {
-                const uchar *S = src[k];
-                int *D = dst[k];
-                for( dx = 0; dx < len0; dx += cn )
-                {
-                    v_int16x8 a0 = v_load(alpha+dx*2);
-                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
-                    v_uint16x8 s0, s1;
-                    v_zip(v_load_expand(S+xofs[dx]), v_load_expand(S+xofs[dx]+cn), s0, s1);
-                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
-                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
-                }
-            }
-        }
         else
         {
-            const int step = 16;
-            const int len0 = (xmax - cn) & -step;
-            for( ; k <= (count - 2); k+=2 )
-            {
-                const uchar *S0 = src[k];
-                int *D0 = dst[k];
-                const uchar *S1 = src[k+1];
-                int *D1 = dst[k+1];
-
-                for( dx = 0; dx < len0; dx += step )
-                {
-                    v_int16x8 a0 = v_load(alpha+dx*2);
-                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
-                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
-                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
-                    v_uint8x16 s01, s23;
-                    v_zip(v_lut(S0, xofs+dx), v_lut(S0+cn, xofs+dx), s01, s23);
-                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
-                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
-                    v_store(&D0[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
-                    v_store(&D0[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
-                    v_zip(v_lut(S1, xofs+dx), v_lut(S1+cn, xofs+dx), s01, s23);
-                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
-                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
-                    v_store(&D1[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
-                    v_store(&D1[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
-                }
-            }
-            for( ; k < count; k++ )
-            {
-                const uchar *S = src[k];
-                int *D = dst[k];
-                for( dx = 0; dx < len0; dx += step )
-                {
-                    v_int16x8 a0 = v_load(alpha+dx*2);
-                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
-                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
-                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
-                    v_uint8x16 s01, s23;
-                    v_zip(v_lut(S, xofs+dx), v_lut(S+cn, xofs+dx), s01, s23);
-                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
-                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
-                    v_store(&D[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
-                    v_store(&D[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
-                }
-            }
+            return 0; // images with channels >4 are out of optimization scope
        }
        return dx;
    }
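For reference, every output the removed branches produce is a two-tap fixed-point blend: a source sample and the sample `cn` bytes to its right, weighted by one interleaved `alpha` pair; the `v_zip` + `v_dotprod` sequence evaluates four such outputs per store. Below is a minimal scalar sketch of one row under that reading, reusing the hunk's names; the helper itself is illustrative and not part of the patch.

```cpp
typedef unsigned char uchar; // assumed to match OpenCV's uchar typedef

// Illustrative scalar equivalent of one row of the horizontal pass:
// D[dx] = S[xofs[dx]]*alpha[2*dx] + S[xofs[dx]+cn]*alpha[2*dx+1],
// the same dot product the vector code produced for lanes dx..dx+3.
void hresize_row_scalar(const uchar* S, int* D, const int* xofs,
                        const short* alpha, int xmax, int cn)
{
    for (int dx = 0; dx < xmax; dx++)
    {
        int sx = xofs[dx];
        D[dx] = S[sx] * alpha[dx * 2] + S[sx + cn] * alpha[dx * 2 + 1];
    }
}
```

With the >4-channel specializations gone, the new `return 0;` path signals that no columns were vectorized, so the caller's scalar loop (equivalent to the sketch above) handles those rows entirely, while `return dx;` still reports the vectorized prefix for the channel counts that keep their SIMD branches.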
|
|
|
|