diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index c87752bae8..e4744a3b60 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -667,13 +667,13 @@ cglobal imdct_calc, 3,5,3
     push r1
     push r0
 %else
-    sub rsp, 8
+    sub rsp, 8+32*WIN64 ; allocate win64 shadow space
 %endif
     call r4
 %if ARCH_X86_32
     add esp, 12
 %else
-    add rsp, 8
+    add rsp, 8+32*WIN64
 %endif
     POP r1
     POP r3
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index fc6c983052..6e29ce7373 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     movsxd r7, r1d
     lea    r8, [r7+r7*2]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
 %if WIN64
-    sub    rsp, 0x98
-    %define pix_tmp rsp+0x30
+    %define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-    sub    rsp, 0x68
     %define pix_tmp rsp
 %endif
 
@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 
-%if WIN64
-    add    rsp, 0x98
-%else
-    add    rsp, 0x68
-%endif
     RET
 %endmacro
 
@@ -704,13 +697,16 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
     movsxd r7, r1d
     lea    r8, [r7*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
-    sub    rsp, 0x88
+%if WIN64
+    %define pix_tmp rsp+0x20 ; shadow space
+%else
     %define pix_tmp rsp
+%endif
 
     ; transpose 8x16 -> tmp space
     TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -730,7 +726,6 @@ cglobal deblock_h_luma_intra_8, 4,9
     sub    r5, r7
     shr    r7, 3
     TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra_8, 2,4,8,0x80
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 6878b355b3..a98f55924d 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -334,14 +334,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %if mmsize != 8
-                %assign xmm_regs_used %2
+            %assign stack_size_padded stack_size
+            %if WIN64
+                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                    %endif
+                %endif
             %endif
             %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                %endif
+                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -351,14 +355,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
                 mov rstk, rsp
-                %assign stack_size_padded stack_size
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                    %if mmsize == 32 && xmm_regs_used & 1
-                        ; re-align to 32 bytes
-                        %assign stack_size_padded (stack_size_padded + 16)
-                    %endif
-                %endif
                 %if %1 < 0 ; need to store rsp on stack
                     sub rsp, gprsize+stack_size_padded
                     and rsp, ~(%%stack_alignment-1)
@@ -370,9 +366,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                     %xdefine rstkm rstk
                 %endif
             %endif
-            %if xmm_regs_used > 6
-                WIN64_PUSH_XMM
-            %endif
+            WIN64_PUSH_XMM
         %endif
     %endif
 %endmacro
@@ -433,40 +427,55 @@ DECLARE_REG 14, R15, 120
 %endmacro
 
 %macro WIN64_PUSH_XMM 0
-    %assign %%i xmm_regs_used
-    %rep (xmm_regs_used-6)
-        %assign %%i %%i-1
-        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-    %endrep
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset + 8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        WIN64_PUSH_XMM
+    %if xmm_regs_used > 8
+        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro
 
 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        %if stack_size_padded == 0
-            add %1, (xmm_regs_used-6)*16+16
-        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
         %endif
     %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+    %endif
 %endmacro
 
 %macro WIN64_RESTORE_XMM 1
@@ -683,12 +692,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %xdefine rstk rsp
-    %assign stack_offset 0
-    %assign stack_size 0
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
     %ifnidn %3, ""
         PROLOGUE %3
     %endif