x86/tx_float: adjust internal ASM call ABI again

There are many ways to go about it, and this one seems optimal for both
MDCTs and PFA FFTs without requiring excessive instructions or stack usage.
pull/388/head
Lynne 2 years ago
parent 7e7baf8ab8
commit 3241e9225c
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464
  1. 28
      libavutil/x86/tx_float.asm

@@ -22,11 +22,10 @@
; based upon and compare. ; based upon and compare.
; Intra-asm call convention: ; Intra-asm call convention:
; 272 bytes of stack available ; 320 bytes of stack available
; First 10 GPRs available ; 14 GPRs available (last 4 must not be clobbered)
; Additionally, don't clobber ctx, in, out, len, lut
; All vector regs available ; All vector regs available
; Don't clobber ctx, len, lut
; in and out must point to the end
; TODO: ; TODO:
; carry over registers from smaller transforms to save on ~8 loads/stores ; carry over registers from smaller transforms to save on ~8 loads/stores
@@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
movaps m0, [inq] movaps m0, [inq]
FFT2 m0, m1 FFT2 m0, m1
movaps [outq], m0 movaps [outq], m0
add inq, mmsize*1
add outq, mmsize*1
ret ret
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
@@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
movaps [outq + 1*mmsize], m0 movaps [outq + 1*mmsize], m0
%if %3 %if %3
add inq, mmsize*2
add outq, mmsize*2
ret ret
%else %else
RET RET
@@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
movups [outq + 3*mmsize], m1 movups [outq + 3*mmsize], m1
%if %1 %if %1
add inq, mmsize*4
add outq, mmsize*4
ret ret
%else %else
RET RET
@@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
vextractf128 [outq + 16*3], m0, 1 vextractf128 [outq + 16*3], m0, 1
%if %1 %if %1
add inq, mmsize*2
add outq, mmsize*2
ret ret
%else %else
RET RET
@@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
vextractf128 [outq + 16*7], m1, 1 vextractf128 [outq + 16*7], m1, 1
%if %2 %if %2
add inq, mmsize*4
add outq, mmsize*4
ret ret
%else %else
RET RET
@@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
vextractf128 [outq + 16*15], m5, 1 vextractf128 [outq + 16*15], m5, 1
%if %2 %if %2
add inq, mmsize*8
add outq, mmsize*8
ret ret
%else %else
RET RET
@@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
add outq, 8*mmsize add outq, 8*mmsize
add rtabq, 4*mmsize add rtabq, 4*mmsize
sub itabq, 4*mmsize sub itabq, 4*mmsize
sub lenq, 4*mmsize sub tgtq, 4*mmsize
jg .synth_deinterleave jg .synth_deinterleave
%if %2 %if %2
mov lenq, tgtq sub outq, tmpq
add outq, offq neg tmpq
lea inq, [inq + tmpq*4]
ret ret
%else %else
RET RET
@@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1 vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
%if %2 %if %2
add outq, 16*mmsize sub inq, 16*mmsize
ret ret
%else %else
RET RET

Loading…
Cancel
Save