diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 21cd8b37fd..516e4384b1 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -736,11 +736,11 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w packuswb m2, m3 ; (byte) { U0, ..., U15 } packuswb m0, m1 ; (byte) { V0, ..., V15 } %ifidn %2, nv12 - mova [dstUq+wq], m2 - mova [dstVq+wq], m0 + mov%1 [dstUq+wq], m2 + mov%1 [dstVq+wq], m0 %else ; nv21 - mova [dstVq+wq], m2 - mova [dstUq+wq], m0 + mov%1 [dstVq+wq], m2 + mov%1 [dstUq+wq], m0 %endif ; nv12/21 add wq, mmsize jl .loop_%1 @@ -750,15 +750,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w ; %1 = nr. of XMM registers ; %2 = nv12 or nv21 %macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, tmp, src, w %if ARCH_X86_64 movsxd wq, dword r5m %else ; x86-32 mov wq, r5m %endif + mov tmpq, srcq + or tmpq, dstUq + or tmpq, dstVq add dstUq, wq add dstVq, wq - test srcq, 15 + test tmpq, 15 lea srcq, [srcq+wq*2] pcmpeqb m5, m5 ; (byte) { 0xff } x 16 psrlw m5, 8 ; (word) { 0x00ff } x 8