@ -804,62 +804,53 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
jl .next2blocks
REP_RET
; ---------------------------------------------------------------------------
; h264_idct_add8_sse2_plane / add8_sse2_cycle
;
; NOTE(review): this span appears to hold two revisions of the same logic
; interleaved line by line: a looping helper (label
; h264_idct_add8_sse2_plane, indexed by r5 through the scan8 table) and an
; unrolled macro (add8_sse2_cycle, indexed by its parameters).  Many
; instructions therefore occur as an "r5" / "%1" pair.  Confirm which form is
; meant to be live before assembling.
;
; Shared shape of both forms, as far as the visible instructions show:
;   1. load a 16-bit word from the table addressed through r4;
;   2. if it is non-zero, build a destination address in r0 and call
;      x264_add8x4_idct_sse2;
;   3. otherwise OR the words at [r2] and [r2+32]; if the result is non-zero,
;      build the same address and call h264_idct_dc_add8_mmx2;
;   4. advance r2 by 64 bytes.
;
; Macro parameters (add8_sse2_cycle):
;   %1 - cycle index; used in label names, in the offset %1*8+64 applied to
;        r1, and in the "%1 < 3" test that guards the final r2 advance.
;   %2 - byte offset added to r4 for the 16-bit load in step 1.
; ---------------------------------------------------------------------------
h264_idct_add8_sse2_plane:
.next2blocks
; Looping form: r5 indexes the scan8 byte table; the byte read is then used
; as an offset into the table at r4.
movzx r0 , byte [ scan8 + r5 ]
movzx r0 , word [ r4 + r0 ]
% macro add8_sse2_cycle 2
; Macro form: the offset into the table at r4 is the constant %2.
movzx r0 , word [ r4 +% 2 ]
test r0 , r0
; Loaded word is zero -> go and try the [r2] / [r2+32] check instead.
jz .try_ dc
jz .try % 1 dc
% ifdef ARCH_X86_64
; r0 = 32-bit offset read via r1, plus the pointer stored at [r10].
mov r0d , dword [ r1 + r5 * 4 ]
mov r0d , dword [ r1 +% 1 * 8 + 6 4 ]
add r0 , [ r10 ]
% else
; Non-x86_64: the base pointer is reloaded through a stack argument slot
; (r1m / r0m) and dereferenced before the offset read via r1 is added.
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , r0m
mov r0 , [ r0 ]
add r0 , dword [ r1 + r5 * 4 ]
add r0 , dword [ r1 +% 1 * 8 + 6 4 ]
% endif
call x264_add8x4_idct_sse2
; Looping form: step r5 by 2 and r2 by 64, repeat while (r5 & 3) != 0.
add r5 , 2
add r2 , 64
test r5 , 3
jnz .next2blocks
rep ret
.try_dc
; Macro form: the call above is done, skip over the alternative path.
jmp .cycle % 1 end
.try % 1 dc
; OR the 16-bit words at [r2] and [r2+32]; nothing to do if both are zero.
; presumably these are the first coefficients of two adjacent blocks -- TODO confirm
movsx r0 , word [ r2 ]
or r0w , word [ r2 + 32 ]
jz .skip2blocks
jz .cycle % 1 end
% ifdef ARCH_X86_64
; Same destination-address computation as in the first path.
mov r0d , dword [ r1 + r5 * 4 ]
mov r0d , dword [ r1 +% 1 * 8 + 6 4 ]
add r0 , [ r10 ]
% else
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , r0m
mov r0 , [ r0 ]
add r0 , dword [ r1 + r5 * 4 ]
add r0 , dword [ r1 +% 1 * 8 + 6 4 ]
% endif
call h264_idct_dc_add8_mmx2
.skip2blocks
add r5 , 2
.cycle % 1 end
; Macro form: r2 is advanced only when the cycle index %1 is below 3.
% if %1 < 3
add r2 , 64
test r5 , 3
jnz .next2blocks
rep ret
% endif
% endmacro
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
;
; Entry point declared through cglobal with the operands 5, 7, 8.
; presumably r0..r4 hold the five arguments listed above, in order, per the
; x86inc convention -- TODO confirm.
;
; NOTE(review): the body below appears to interleave two revisions: one that
; calls h264_idct_add8_sse2_plane twice (using r5 as a running index) and one
; that expands add8_sse2_cycle four times.  Confirm which set of lines is
; meant to be live before assembling.
;
; Visible flow:
;   - r5 is set to 16 and r2 is advanced by 512 bytes;
;   - under PIC, r11 receives the address of scan8_mem;
;   - on x86_64, r10 keeps a copy of r0 so that [r10] can be read later;
;   - two work items are processed, the pointer-to-pointer (r10, or the r0
;     stack slot on non-x86_64) is advanced by gprsize, and two more work
;     items are processed.
cglobal h264_idct_add8_sse2 , 5 , 7 , 8
mov r5 , 16
add r2 , 512
% ifdef PIC
lea r11 , [ scan8_mem ]
% endif
% ifdef ARCH_X86_64
mov r10 , r0
% endif
call h264_idct_add8_sse2_plane
; Cycles 0 and 1: the second macro argument is the byte offset used for the
; 16-bit load relative to r4.
add8_sse2_cycle 0 , 0x09
add8_sse2_cycle 1 , 0x11
; Step to the next pointer in the array addressed by the first argument.
% ifdef ARCH_X86_64
add r10 , gprsize
% else
add r0mp , gprsize
% endif
call h264_idct_add8_sse2_plane
; Cycles 2 and 3, same pattern with the next pair of offsets.
add8_sse2_cycle 2 , 0x21
add8_sse2_cycle 3 , 0x29
RET