@ -82,7 +82,7 @@ SECTION .text
% endif
% endmacro
% macro PULSES_SEARCH 2 ; % 1 - add or sub, %2 - use approximation
% macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6 , mm_const_float_0_5 ; Syy_norm += 1.0/2
@ -96,17 +96,17 @@ align 16
movaps m4 , [ tmpY + r4 ] ; y[i]
movaps m5 , [ tmpX + r4 ] ; X[i]
% if %2
% if USE_APPROXIMATION == 1
xorps m0 , m0
cmpps m0 , m0 , m5 , 4 ; m0 = (X[i] != 0.0)
% endif
%endif
addps m4 , m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5 , m7 ; m5 = Sxy_new = X[i] + Sxy_norm
% if %2
% if USE_APPROXIMATION == 1
andps m5 , m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
% endif
%endif
% else
movaps m5 , [ tmpY + r4 ] ; m5 = y[i]
@ -119,7 +119,7 @@ align 16
andps m5 , m0 ; (0<y)?m5:0
% endif
% if %2
% if USE_APPROXIMATION == 1
rsqrtps m4 , m4
mulps m5 , m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
% else
@ -211,13 +211,8 @@ align 16
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
% macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
% if %1
cglobal pvq_search_approx , 4 , 5 + num_pic_regs , 11 , 256 * 4 , inX , outY , K , N
% else
cglobal pvq_search_exact , 4 , 5 + num_pic_regs , 11 , 256 * 4 , inX , outY , K , N
% endif
% macro PVQ_FAST_SEARCH 1
cglobal pvq_search % 1 , 4 , 5 + num_pic_regs , 11 , 256 * 4 , inX , outY , K , N
% define tmpX rsp
% define tmpY outYq
@ -260,7 +255,7 @@ align 16
jz %% zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0 , dword Kd ; m0 = K
% if % 1
% if USE_APPROXIMATION == 1
rcpss xm1 , xm1 ; m1 = approx(1/Sx)
mulss xm0 , xm1 ; m0 = K*(1/Sx)
% else
@ -313,7 +308,7 @@ align 16
align 16 ; K - pulses > 0
% %add_pulses_loop:
PULSES_SEARCH add , % 1 ; m6 Syy_norm ; m7 Sxy_norm
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd , 1
jnz %% add_pulses_loop
@ -325,7 +320,7 @@ align 16 ; K - pulses > 0
align 16
% %remove_pulses_loop:
PULSES_SEARCH sub , % 1 ; m6 Syy_norm ; m7 Sxy_norm
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd , 1
jnz %% remove_pulses_loop
@ -376,11 +371,15 @@ align 16
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
% define USE_APPROXIMATION 1
INIT_XMM ss e2
PVQ_FAST_SEARCH 1
PVQ_FAST_SEARCH _approx
INIT_XMM ss e4
PVQ_FAST_SEARCH 1
PVQ_FAST_SEARCH _approx
% define USE_APPROXIMATION 0
INIT_XMM avx
PVQ_FAST_SEARCH 0
PVQ_FAST_SEARCH _exact