H.264: tweak some other x86 asm for Atom

pull/2/head
Jason Garrett-Glaser 14 years ago
parent 5ef953e84f
commit a3bf7b864a
Changed files (5):
  1. libavcodec/x86/dsputil_mmx.c (4 lines changed)
  2. libavcodec/x86/h264_chromamc.asm (44 lines changed)
  3. libavcodec/x86/h264_deblock.asm (19 lines changed)
  4. libavcodec/x86/h264_idct.asm (28 lines changed)
  5. libavcodec/x86/x86util.asm (4 lines changed)

--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -456,12 +456,12 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
         "movdqu (%1,%3), %%xmm1     \n\t"
         "movdqu (%1,%3,2), %%xmm2   \n\t"
         "movdqu (%1,%4), %%xmm3     \n\t"
+        "lea (%1,%3,4), %1          \n\t"
         "movdqa %%xmm0, (%2)        \n\t"
         "movdqa %%xmm1, (%2,%3)     \n\t"
         "movdqa %%xmm2, (%2,%3,2)   \n\t"
         "movdqa %%xmm3, (%2,%4)     \n\t"
         "subl $4, %0                \n\t"
-        "lea (%1,%3,4), %1          \n\t"
         "lea (%2,%3,4), %2          \n\t"
         "jnz 1b                     \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
@@ -478,6 +478,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
         "movdqu (%1,%3), %%xmm1     \n\t"
         "movdqu (%1,%3,2), %%xmm2   \n\t"
         "movdqu (%1,%4), %%xmm3     \n\t"
+        "lea (%1,%3,4), %1          \n\t"
         "pavgb (%2), %%xmm0         \n\t"
         "pavgb (%2,%3), %%xmm1      \n\t"
         "pavgb (%2,%3,2), %%xmm2    \n\t"
@@ -487,7 +488,6 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
         "movdqa %%xmm2, (%2,%3,2)   \n\t"
         "movdqa %%xmm3, (%2,%4)     \n\t"
         "subl $4, %0                \n\t"
-        "lea (%1,%3,4), %1          \n\t"
         "lea (%2,%3,4), %2          \n\t"
         "jnz 1b                     \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
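
Both dsputil hunks make the same scheduling change: the lea that advances the source pointer moves out of the subl/lea/lea/jnz cluster at the bottom of the loop to just after the four loads, so on in-order Atom the updated pointer is ready long before the next iteration's movdqu needs it (avg_pixels16_sse2 is identical except that it pavgb-averages against the destination first). For reference, a scalar sketch of what put_pixels16_sse2 computes, using the signature from the hunk header; the asm handles four 16-byte rows per iteration:

    #include <stdint.h>
    #include <string.h>

    static void put_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
    {
        for (int i = 0; i < h; i++) {      /* asm: four rows per iteration */
            memcpy(block, pixels, 16);     /* movdqu load, movdqa store */
            block  += line_size;
            pixels += line_size;
        }
    }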

--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -72,17 +72,17 @@ SECTION .text
 .next4rows
     movq         mm0, [r1   ]
     movq         mm1, [r1+r2]
+    add          r1, r4
     CHROMAMC_AVG mm0, [r0   ]
     CHROMAMC_AVG mm1, [r0+r2]
     movq         [r0   ], mm0
     movq         [r0+r2], mm1
     add          r0, r4
-    add          r1, r4
     movq         mm0, [r1   ]
     movq         mm1, [r1+r2]
+    add          r1, r4
     CHROMAMC_AVG mm0, [r0   ]
     CHROMAMC_AVG mm1, [r0+r2]
-    add          r1, r4
     movq         [r0   ], mm0
     movq         [r0+r2], mm1
     add          r0, r4
@@ -472,8 +472,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     mov          r6d, r4d
     shl          r4d, 8
     sub          r4, r6
-    add          r4, 8           ; x*288+8 = x<<8 | (8-x)
     mov          r6, 8
+    add          r4, 8           ; x*288+8 = x<<8 | (8-x)
     sub          r6d, r5d
     imul         r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
     imul         r4d, r5d        ; y *(x*255+8) = y *x<<8 | y *(8-x)
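
The add r4, 8 moves one slot down only so that it is not adjacent to the sub r4, r6 it depends on; the independent mov r6, 8 now sits between them. The packing trick in the comment is this: for 0 <= x <= 8, x*255 + 8 equals (x<<8) | (8-x), one 16-bit word holding the byte pair (x, 8-x) that pmaddubsw multiplies against horizontally adjacent pixels (the 288 in the comment reads like a typo for 255, the value the imul comments use). A quick check of the identity:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (int x = 0; x <= 8; x++) {
            /* high byte x, low byte 8-x; the OR is a plain add, no carry */
            uint16_t packed = (uint16_t)((x << 8) | (8 - x));
            assert(packed == x * 255 + 8);
        }
        return 0;
    }
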
@@ -481,24 +481,23 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movd         m7, r6d
     movd         m6, r4d
     movdqa       m5, [rnd_2d_%2]
+    movq         m0, [r1  ]
+    movq         m1, [r1+1]
     pshuflw      m7, m7, 0
     pshuflw      m6, m6, 0
+    punpcklbw    m0, m1
     movlhps      m7, m7
     movlhps      m6, m6
-    movq         m0, [r1  ]
-    movq         m1, [r1 +1]
-    punpcklbw    m0, m1
-    add          r1, r2
 .next2rows
-    movq         m1, [r1       ]
-    movq         m2, [r1     +1]
-    movq         m3, [r1+r2    ]
-    movq         m4, [r1+r2  +1]
+    movq         m1, [r1+r2*1  ]
+    movq         m2, [r1+r2*1+1]
+    movq         m3, [r1+r2*2  ]
+    movq         m4, [r1+r2*2+1]
     lea          r1, [r1+r2*2]
     punpcklbw    m1, m2
-    punpcklbw    m3, m4
     movdqa       m2, m1
+    punpcklbw    m3, m4
     movdqa       m4, m3
     pmaddubsw    m0, m7
     pmaddubsw    m1, m6
@@ -508,8 +507,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     paddw        m2, m5
     paddw        m1, m0
     paddw        m3, m2
-    movdqa       m0, m4
     psrlw        m1, 6
+    movdqa       m0, m4
     psrlw        m3, 6
 %ifidn %1, avg
     movq         m2, [r0  ]
@@ -576,6 +575,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movq         m1, [r1+r2  ]
     movdqa       m2, m1
     movq         m3, [r1+r2*2]
+    lea          r1, [r1+r2*2]
     punpcklbw    m0, m1
     punpcklbw    m2, m3
     pmaddubsw    m0, m7
@@ -594,7 +594,6 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movhps       [r0+r2], m0
     sub          r3d, 2
     lea          r0, [r0+r2*2]
-    lea          r1, [r1+r2*2]
     jg .next2yrows
     REP_RET
 %endmacro
@@ -607,8 +606,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     mov          r6, r4
     shl          r4d, 8
     sub          r4d, r6d
-    add          r4d, 8          ; x*288+8
     mov          r6, 8
+    add          r4d, 8          ; x*288+8
     sub          r6d, r5d
     imul         r6d, r4d        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
     imul         r4d, r5d        ; y *(x*255+8) = y *x<<8 | y *(8-x)
@@ -616,17 +615,16 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     movd         m7, r6d
     movd         m6, r4d
     movq         m5, [pw_32]
+    movd         m0, [r1  ]
     pshufw       m7, m7, 0
+    punpcklbw    m0, [r1+1]
     pshufw       m6, m6, 0
-    movd         m0, [r1  ]
-    punpcklbw    m0, [r1 +1]
-    add          r1, r2
 .next2rows
-    movd         m1, [r1       ]
-    movd         m3, [r1+r2    ]
-    punpcklbw    m1, [r1     +1]
-    punpcklbw    m3, [r1+r2  +1]
+    movd         m1, [r1+r2*1  ]
+    movd         m3, [r1+r2*2  ]
+    punpcklbw    m1, [r1+r2*1+1]
+    punpcklbw    m3, [r1+r2*2+1]
     lea          r1, [r1+r2*2]
     movq         m2, m1
     movq         m4, m3
@@ -638,8 +636,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     paddw        m2, m5
     paddw        m1, m0
     paddw        m3, m2
-    movq         m0, m4
     psrlw        m1, 6
+    movq         m0, m4
     psrlw        m3, 6
     packuswb     m1, m1
     packuswb     m3, m3
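
Every mc8/mc4 hunk is the same kind of reordering: the first row's loads are hoisted ahead of the independent pshufw/pshuflw setup, the per-loop add r1, r2 is folded into the loop's addressing ([r1+r2*1] and [r1+r2*2] instead of a pre-incremented r1), and the register copy between a shift and its consumer is slotted after the psrlw. What the kernels compute is unchanged: 2-D bilinear chroma interpolation with the coefficients built above and pw_32 as the rounding term. A scalar model, assuming the usual dst/src/stride/h/x/y argument layout of these functions (w is 4 or 8):

    #include <stdint.h>

    static void chroma_mc_ref(uint8_t *dst, const uint8_t *src, int stride,
                              int h, int x, int y, int w)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++)
                dst[j] = ((8 - x) * (8 - y) * src[j]
                          +      x * (8 - y) * src[j + 1]
                          + (8 - x) *      y * src[j + stride]
                          +      x *      y * src[j + stride + 1]
                          + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }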

--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -240,17 +240,17 @@ cextern pb_A1
 ; out: m1=p0' m2=q0'
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
-    pxor    m5, m1, m2   ; p0^q0
-    pand    m5, [pb_1]   ; (p0^q0)&1
     pcmpeqb m4, m4
+    pxor    m5, m1, m2   ; p0^q0
     pxor    m3, m4
+    pand    m5, [pb_1]   ; (p0^q0)&1
     pavgb   m3, m0       ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pxor    m4, m1
+    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pavgb   m4, m2       ; (q0 - p0 + 256)>>1
     pavgb   m3, m5
-    paddusb m3, m4       ; d+128+33
     mova    m6, [pb_A1]
+    paddusb m3, m4       ; d+128+33
     psubusb m6, m3
     psubusb m3, [pb_A1]
     pminub  m6, m7
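
DEBLOCK_P0_Q0 is again pure scheduling; the arithmetic in the comments is untouched. The identity behind "(p1 - q1 + 256)>>1" is that pavgb computes (a + b + 1) >> 1 on unsigned bytes, so averaging p1 against the complement of q1 (the pxor m3, m4 with m4 all-ones) produces the biased difference. A small exhaustive check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (int p1 = 0; p1 < 256; p1++)
            for (int q1 = 0; q1 < 256; q1++) {
                /* pxor m3, m4 (m4 = all ones), then pavgb m3, m0 */
                int avg = (p1 + (q1 ^ 0xFF) + 1) >> 1;
                assert(avg == ((p1 - q1 + 256) >> 1));
            }
        return 0;
    }
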
@@ -411,16 +411,16 @@ cglobal deblock_%2_luma_8_%1, 5,5
     LOAD_MASK    r2, r3
     mov          r3, r4mp
+    pcmpeqb      m3, m3
     movd         m4, [r3]           ; tc0
     punpcklbw    m4, m4
     punpcklbw    m4, m4             ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
     mova         [esp+%3], m4       ; tc
-    pcmpeqb      m3, m3
     pcmpgtb      m4, m3
+    mova         m3, [r4]           ; p2
     pand         m4, m7
     mova         [esp], m4          ; mask
-    mova         m3, [r4]           ; p2
     DIFF_GT2     m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand         m6, m4
     pand         m4, [esp+%3]       ; tc
@@ -430,11 +430,10 @@ cglobal deblock_%2_luma_8_%1, 5,5
     mova         m4, [r0+2*r1]      ; q2
     DIFF_GT2     m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
-    mova         m5, [esp]          ; mask
-    pand         m6, m5
+    pand         m6, [esp]          ; mask
     mova         m5, [esp+%3]       ; tc
-    pand         m5, m6
     psubb        m7, m6
+    pand         m5, m6
     mova         m3, [r0+r1]
     LUMA_Q1      m3, m4, [r0+2*r1], [r0+r1], m5, m6
@@ -482,10 +481,10 @@ cglobal deblock_h_luma_8_%1, 0,5
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     mov          r0, r0mp
     sub          r0, 2
+    lea          r1, [r0+r4]
     movq         m0, [pix_tmp+0x10]
     movq         m1, [pix_tmp+0x20]
-    lea          r1, [r0+r4]
     movq         m2, [pix_tmp+0x30]
     movq         m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -82,10 +82,10 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
     RET

 %macro IDCT8_1D 2
-    mova         m4, m5
     mova         m0, m1
-    psraw        m4, 1
     psraw        m1, 1
+    mova         m4, m5
+    psraw        m4, 1
     paddw        m4, m5
     paddw        m1, m0
     paddw        m4, m7
@@ -95,16 +95,16 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
     psubw        m0, m3
     psubw        m5, m3
+    psraw        m3, 1
     paddw        m0, m7
     psubw        m5, m7
-    psraw        m3, 1
     psraw        m7, 1
     psubw        m0, m3
     psubw        m5, m7
-    mova         m3, m4
     mova         m7, m1
     psraw        m1, 2
+    mova         m3, m4
     psraw        m3, 2
     paddw        m3, m0
     psraw        m0, 2
@@ -113,12 +113,12 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
     psubw        m0, m4
     psubw        m7, m5
-    mova         m4, m2
     mova         m5, m6
-    psraw        m4, 1
     psraw        m6, 1
-    psubw        m4, m5
+    mova         m4, m2
+    psraw        m4, 1
     paddw        m6, m2
+    psubw        m4, m5

     mova         m2, %1
     mova         m5, %2
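
All three IDCT8_1D hunks apply one pattern: where the macro used to leave two psraw instructions back to back, the copies and shifts from the two independent halves of the butterfly are now rearranged so that no two shifts are adjacent (the working assumption being that Atom has a single SIMD shift unit, so adjacent psraw cannot issue together, while a cheap register copy can pair with either). A C sketch of the idea with hypothetical names; each chain keeps a value and also needs it halved:

    /* before: the two >>= 1 land on adjacent instructions and collide */
    static void grouped(int *a, int *b, int *a_half, int *b_half)
    {
        *a_half = *a;
        *b_half = *b;
        *a_half >>= 1;
        *b_half >>= 1;
    }

    /* after: each shift is flanked by the other chain's copy, so the
     * shifts never sit next to each other */
    static void alternated(int *a, int *b, int *a_half, int *b_half)
    {
        *b_half = *b;
        *b_half >>= 1;
        *a_half = *a;
        *a_half >>= 1;
    }
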
@@ -337,7 +337,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0
     test         r6, r6
     jz .skipblock
     mov          r6d, dword [r1+r5*4]
-    lea          r6, [r0+r6]
+    add          r6, r0
     add          word [r2], 32
     IDCT8_ADD_MMX_START r2  , rsp
     IDCT8_ADD_MMX_START r2+8, rsp+64
@@ -391,7 +391,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
     REP_RET
 .no_dc
     mov          r6d, dword [r1+r5*4]
-    lea          r6, [r0+r6]
+    add          r6, r0
     IDCT4_ADD    r6, r2, r3
 .skipblock
     inc          r5
@@ -414,7 +414,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
     test         r6, r6
     jz .skipblock
     mov          r6d, dword [r1+r5*4]
-    lea          r6, [r0+r6]
+    add          r6, r0
     IDCT4_ADD    r6, r2, r3
 .skipblock
     inc          r5
@@ -456,7 +456,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
 %define dst_regd r1d
 %endif
     mov          dst_regd, dword [r1+r5*4]
-    lea          dst_reg, [r0+dst_reg]
+    add          dst_reg, r0
     DC_ADD_MMX2_OP movh, dst_reg, r3, r6
 %ifndef ARCH_X86_64
     mov          r1, r1m
@@ -513,7 +513,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
     RET
 .no_dc
     mov          r6d, dword [r1+r5*4]
-    lea          r6, [r0+r6]
+    add          r6, r0
     add          word [r2], 32
     IDCT8_ADD_MMX_START r2  , rsp
     IDCT8_ADD_MMX_START r2+8, rsp+64
@@ -558,7 +558,7 @@ INIT_MMX
 %define dst_regd r1d
 %endif
     mov          dst_regd, dword [r1+r5*4]
-    lea          dst_reg, [r0+dst_reg]
+    add          dst_reg, r0
     DC_ADD_MMX2_OP mova, dst_reg, r3, r6
     lea          dst_reg, [dst_reg+r3*4]
     DC_ADD_MMX2_OP mova, dst_reg, r3, r6
@@ -573,7 +573,7 @@ INIT_MMX
 .no_dc
 INIT_XMM
     mov          dst_regd, dword [r1+r5*4]
-    lea          dst_reg, [r0+dst_reg]
+    add          dst_reg, r0
     IDCT8_ADD_SSE dst_reg, r2, r3, r6
 %ifndef ARCH_X86_64
     mov          r1, r1m
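
The recurring one-liner through these idct_add functions replaces lea r6, [r0+r6] with add r6, r0. Both produce the same destination pointer; the add form is presumably preferred because on Atom lea goes through the slower address-generation path while a flag-setting add is a plain ALU op, and the flags are dead here. Note that the scaled lea dst_reg, [dst_reg+r3*4] forms stay, since add cannot express them. The operation itself is just pointer arithmetic:

    #include <stdint.h>

    /* offset comes from the scan table read by mov r6d, [r1+r5*4];
     * base (r0) is the start of the destination plane */
    static uint8_t *idct_dst(uint8_t *base, intptr_t offset)
    {
        return base + offset;   /* lea r6,[r0+r6] == add r6,r0 */
    }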

--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -497,10 +497,10 @@
 %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
     movh         %3, [%7]
     movh         %4, [%7+%8]
-    punpcklbw    %3, %5
-    punpcklbw    %4, %5
     psraw        %1, %6
     psraw        %2, %6
+    punpcklbw    %3, %5
+    punpcklbw    %4, %5
     paddw        %3, %1
     paddw        %4, %2
     packuswb     %3, %5
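
In STORE_DIFFx2 the two punpcklbw sink below the psraw pair: the unpacks consume the movh loads directly, so sliding the two independent shifts between load and use gives the loads extra time to complete on in-order Atom. A scalar model of the macro per its own argument comment (hypothetical helper; saturation as in packuswb):

    #include <stdint.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* add two rows of right-shifted residuals onto 8 pixels each */
    static void store_diffx2_ref(uint8_t *dst, int stride, int shift,
                                 const int16_t *add1, const int16_t *add2)
    {
        for (int i = 0; i < 8; i++) {
            dst[i]          = clip_u8(dst[i]          + (add1[i] >> shift));
            dst[i + stride] = clip_u8(dst[i + stride] + (add2[i] >> shift));
        }
    }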
