Unroll the loop in h264_idct_add8_sse2(). This also lets us inline the scan8[]
values directly in the code and remove the loop setup. 20% faster in the
function, 0.8% overall.

See "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on ML.

Originally committed as revision 25171 to svn://svn.ffmpeg.org/ffmpeg/trunk
oldabi
Ronald S. Bultje 15 years ago
parent d801f1c848
commit 4bca677494
  1. 49
      libavcodec/x86/h264_idct.asm

@@ -804,62 +804,53 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
jl .next2blocks jl .next2blocks
REP_RET REP_RET
h264_idct_add8_sse2_plane: %macro add8_sse2_cycle 2
.next2blocks movzx r0, word [r4+%2]
movzx r0, byte [scan8+r5]
movzx r0, word [r4+r0]
test r0, r0 test r0, r0
jz .try_dc jz .try%1dc
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
mov r0d, dword [r1+r5*4] mov r0d, dword [r1+%1*8+64]
add r0, [r10] add r0, [r10]
%else %else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, r0m
mov r0, [r0] mov r0, [r0]
add r0, dword [r1+r5*4] add r0, dword [r1+%1*8+64]
%endif %endif
call x264_add8x4_idct_sse2 call x264_add8x4_idct_sse2
add r5, 2 jmp .cycle%1end
add r2, 64 .try%1dc
test r5, 3
jnz .next2blocks
rep ret
.try_dc
movsx r0, word [r2 ] movsx r0, word [r2 ]
or r0w, word [r2+32] or r0w, word [r2+32]
jz .skip2blocks jz .cycle%1end
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
mov r0d, dword [r1+r5*4] mov r0d, dword [r1+%1*8+64]
add r0, [r10] add r0, [r10]
%else %else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func mov r0, r0m
mov r0, [r0] mov r0, [r0]
add r0, dword [r1+r5*4] add r0, dword [r1+%1*8+64]
%endif %endif
call h264_idct_dc_add8_mmx2 call h264_idct_dc_add8_mmx2
.skip2blocks .cycle%1end
add r5, 2 %if %1 < 3
add r2, 64 add r2, 64
test r5, 3 %endif
jnz .next2blocks %endmacro
rep ret
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8 cglobal h264_idct_add8_sse2, 5, 7, 8
mov r5, 16
add r2, 512 add r2, 512
%ifdef PIC
lea r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
mov r10, r0 mov r10, r0
%endif %endif
call h264_idct_add8_sse2_plane add8_sse2_cycle 0, 0x09
add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
add r10, gprsize add r10, gprsize
%else %else
add r0mp, gprsize add r0mp, gprsize
%endif %endif
call h264_idct_add8_sse2_plane add8_sse2_cycle 2, 0x21
add8_sse2_cycle 3, 0x29
RET RET

Loading…
Cancel
Save