x86: use 32-bit source registers with movd instruction

yasm tolerates a mismatch between movd/movq and the source register size,
adjusting the instruction according to the register.  nasm is stricter
and rejects the mismatch.

Signed-off-by: Mans Rullgard <mans@mansr.com>
pull/28/head
Mans Rullgard 13 years ago
parent a3df4781f4
commit 2b140a3d09
  1. 12
      libavcodec/x86/h264_deblock_10bit.asm
  2. 6
      libavcodec/x86/rv34dsp.asm
  3. 4
      libavcodec/x86/rv40dsp.asm

@ -165,7 +165,7 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
SUB rsp, pad SUB rsp, pad
shl r2d, 2 shl r2d, 2
shl r3d, 2 shl r3d, 2
LOAD_AB m4, m5, r2, r3 LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize mov r3, 32/mmsize
mov r2, r0 mov r2, r0
sub r0, r1 sub r0, r1
@ -222,7 +222,7 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
SUB rsp, pad SUB rsp, pad
shl r2d, 2 shl r2d, 2
shl r3d, 2 shl r3d, 2
LOAD_AB m4, m5, r2, r3 LOAD_AB m4, m5, r2d, r3d
mov r3, r1 mov r3, r1
mova am, m4 mova am, m4
add r3, r1 add r3, r1
@ -351,7 +351,7 @@ cglobal deblock_v_luma_10, 5,5,15
%define mask2 m11 %define mask2 m11
shl r2d, 2 shl r2d, 2
shl r3d, 2 shl r3d, 2
LOAD_AB m12, m13, r2, r3 LOAD_AB m12, m13, r2d, r3d
mov r2, r0 mov r2, r0
sub r0, r1 sub r0, r1
sub r0, r1 sub r0, r1
@ -379,7 +379,7 @@ cglobal deblock_v_luma_10, 5,5,15
cglobal deblock_h_luma_10, 5,7,15 cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2 shl r2d, 2
shl r3d, 2 shl r3d, 2
LOAD_AB m12, m13, r2, r3 LOAD_AB m12, m13, r2d, r3d
mov r2, r1 mov r2, r1
add r2, r1 add r2, r1
add r2, r1 add r2, r1
@ -857,7 +857,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
.loop: .loop:
%endif %endif
CHROMA_V_LOAD r5 CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2, r3 LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4 pxor m4, m4
CHROMA_V_LOAD_TC m6, r4 CHROMA_V_LOAD_TC m6, r4
@ -891,7 +891,7 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
.loop: .loop:
%endif %endif
CHROMA_V_LOAD r4 CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2, r3 LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE CHROMA_V_STORE

@ -49,7 +49,7 @@ SECTION .text
cglobal rv34_idct_%1, 1, 2, 0 cglobal rv34_idct_%1, 1, 2, 0
movsx r1, word [r0] movsx r1, word [r0]
IDCT_DC r1 IDCT_DC r1
movd m0, r1 movd m0, r1d
pshufw m0, m0, 0 pshufw m0, m0, 0
movq [r0+ 0], m0 movq [r0+ 0], m0
movq [r0+ 8], m0 movq [r0+ 8], m0
@ -70,7 +70,7 @@ cglobal rv34_idct_dc_add, 3, 3
; calculate DC ; calculate DC
IDCT_DC_ROUND r2 IDCT_DC_ROUND r2
pxor m1, m1 pxor m1, m1
movd m0, r2 movd m0, r2d
psubw m1, m0 psubw m1, m0
packuswb m0, m0 packuswb m0, m0
packuswb m1, m1 packuswb m1, m1
@ -175,7 +175,7 @@ cglobal rv34_idct_dc_add, 3, 3, 6
pxor m1, m1 pxor m1, m1
; calculate DC ; calculate DC
movd m0, r2 movd m0, r2d
lea r2, [r0+r1*2] lea r2, [r0+r1*2]
movd m2, [r0] movd m2, [r0]
movd m3, [r0+r1] movd m3, [r0+r1]

@ -466,8 +466,8 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8
add r2, r6 add r2, r6
neg r6 neg r6
movd m2, r3 movd m2, r3d
movd m3, r4 movd m3, r4d
%ifidn %1,rnd %ifidn %1,rnd
%define RND 0 %define RND 0
SPLATW m2, m2 SPLATW m2, m2

Loading…
Cancel
Save