@ -45,8 +45,10 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 4 + 13 * 8 , 5 + 13 * 8 , 4 + 14 * 8 , 5 + 14 * 8
db 4 + 13 * 8 , 5 + 13 * 8 , 4 + 14 * 8 , 5 + 14 * 8
db 6 + 13 * 8 , 7 + 13 * 8 , 6 + 14 * 8 , 7 + 14 * 8
db 6 + 13 * 8 , 7 + 13 * 8 , 6 + 14 * 8 , 7 + 14 * 8
% ifdef PIC
% ifdef PIC
% define scan8 r11
% define npicregs 1
% define scan8 picregq
% else
% else
% define npicregs 0
% define scan8 scan8_mem
% define scan8 scan8_mem
% endif
% endif
@ -301,10 +303,10 @@ cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_mmx , 5 , 7 , 0
cglobal h264_idct_add16_8_mmx , 5 , 7 + npicregs , 0 , ds t , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , picreg
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -323,13 +325,13 @@ cglobal h264_idct_add16_8_mmx, 5, 7, 0
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_mmx , 5 , 7 , 0
cglobal h264_idct8_add4_8_mmx , 5 , 7 + npicregs , 0 , ds t , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , picreg
% assign pad 128+4-(stack_offset&7)
% assign pad 128+4-(stack_offset&7)
SUB rsp , pad
SUB rsp , pad
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -355,10 +357,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_mmx2 , 5 , 7 , 0
cglobal h264_idct_add16_8_mmx2 , 5 , 8 + npicregs , 0 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -371,16 +373,13 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
test r6 , r6
test r6 , r6
jz .no_dc
jz .no_dc
DC_ADD_MMX2_INIT r2 , r3 , r6
DC_ADD_MMX2_INIT r2 , r3 , r6
% if ARCH_X86_64
% if ARCH_X86_64 == 0
% define dst_reg r10
% define dst2q r1
% define dst_regd r10d
% define dst2d r1d
% else
% define dst_reg r1
% define dst_regd r1d
% endif
% endif
mov ds t_reg d, dword [ r1 + r5 * 4 ]
mov ds t2d , dword [ r1 + r5 * 4 ]
lea ds t_reg , [ r0 + ds t_reg ]
lea ds t2q , [ r0 + ds t2q ]
DC_ADD_MMX2_OP movh , ds t_reg , r3 , r6
DC_ADD_MMX2_OP movh , ds t2q , r3 , r6
% if ARCH_X86_64 == 0
% if ARCH_X86_64 == 0
mov r1 , r1m
mov r1 , r1m
% endif
% endif
@ -402,10 +401,10 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_mmx , 5 , 7 , 0
cglobal h264_idct_add16intra_8_mmx , 5 , 7 + npicregs , 0 , ds t , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , picreg
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -425,10 +424,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_mmx2 , 5 , 7 , 0
cglobal h264_idct_add16intra_8_mmx2 , 5 , 8 + npicregs , 0 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -448,16 +447,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
test r6 , r6
test r6 , r6
jz .skipblock
jz .skipblock
DC_ADD_MMX2_INIT r2 , r3 , r6
DC_ADD_MMX2_INIT r2 , r3 , r6
% if ARCH_X86_64
% if ARCH_X86_64 == 0
% define dst_reg r10
% define dst2q r1
% define dst_regd r10d
% define dst2d r1d
% else
% define dst_reg r1
% define dst_regd r1d
% endif
% endif
mov ds t_reg d, dword [ r1 + r5 * 4 ]
mov ds t2d , dword [ r1 + r5 * 4 ]
add ds t_reg , r0
add ds t2q , r0
DC_ADD_MMX2_OP movh , ds t_reg , r3 , r6
DC_ADD_MMX2_OP movh , ds t2q , r3 , r6
% if ARCH_X86_64 == 0
% if ARCH_X86_64 == 0
mov r1 , r1m
mov r1 , r1m
% endif
% endif
@ -470,13 +466,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_mmx2 , 5 , 7 , 0
cglobal h264_idct8_add4_8_mmx2 , 5 , 8 + npicregs , 0 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
% assign pad 128+4-(stack_offset&7)
% assign pad 128+4-(stack_offset&7)
SUB rsp , pad
SUB rsp , pad
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -489,18 +485,15 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
test r6 , r6
test r6 , r6
jz .no_dc
jz .no_dc
DC_ADD_MMX2_INIT r2 , r3 , r6
DC_ADD_MMX2_INIT r2 , r3 , r6
% if ARCH_X86_64
% if ARCH_X86_64 == 0
% define dst_reg r10
% define dst2q r1
% define dst_regd r10d
% define dst2d r1d
% else
% endif
% define dst_reg r1
mov ds t2d , dword [ r1 + r5 * 4 ]
% define dst_regd r1d
lea ds t2q , [ r0 + ds t2q ]
% endif
DC_ADD_MMX2_OP mova , ds t2q , r3 , r6
mov ds t_regd , dword [ r1 + r5 * 4 ]
lea ds t2q , [ ds t2q + r3 * 4 ]
lea ds t_reg , [ r0 + ds t_reg ]
DC_ADD_MMX2_OP mova , ds t2q , r3 , r6
DC_ADD_MMX2_OP mova , ds t_reg , r3 , r6
lea ds t_reg , [ ds t_reg + r3 * 4 ]
DC_ADD_MMX2_OP mova , ds t_reg , r3 , r6
% if ARCH_X86_64 == 0
% if ARCH_X86_64 == 0
mov r1 , r1m
mov r1 , r1m
% endif
% endif
@ -533,10 +526,10 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
INIT_XMM
INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_sse2 , 5 , 7 , 10
cglobal h264_idct8_add4_8_sse2 , 5 , 8 + npicregs , 10 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
xor r5 , r5
xor r5 , r5
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
.nextblock
.nextblock
movzx r6 , byte [ scan8 + r5 ]
movzx r6 , byte [ scan8 + r5 ]
@ -550,18 +543,15 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10
jz .no_dc
jz .no_dc
INIT_MMX
INIT_MMX
DC_ADD_MMX2_INIT r2 , r3 , r6
DC_ADD_MMX2_INIT r2 , r3 , r6
% if ARCH_X86_64
% if ARCH_X86_64 == 0
% define dst_reg r10
% define dst2q r1
% define dst_regd r10d
% define dst2d r1d
% else
% endif
% define dst_reg r1
mov ds t2d , dword [ r1 + r5 * 4 ]
% define dst_regd r1d
add ds t2q , r0
% endif
DC_ADD_MMX2_OP mova , ds t2q , r3 , r6
mov ds t_regd , dword [ r1 + r5 * 4 ]
lea ds t2q , [ ds t2q + r3 * 4 ]
add ds t_reg , r0
DC_ADD_MMX2_OP mova , ds t2q , r3 , r6
DC_ADD_MMX2_OP mova , ds t_reg , r3 , r6
lea ds t_reg , [ ds t_reg + r3 * 4 ]
DC_ADD_MMX2_OP mova , ds t_reg , r3 , r6
% if ARCH_X86_64 == 0
% if ARCH_X86_64 == 0
mov r1 , r1m
mov r1 , r1m
% endif
% endif
@ -572,9 +562,9 @@ INIT_MMX
REP_RET
REP_RET
.no_dc
.no_dc
INIT_XMM
INIT_XMM
mov ds t_reg d, dword [ r1 + r5 * 4 ]
mov ds t2 d , dword [ r1 + r5 * 4 ]
add ds t_reg , r0
add ds t2q , r0
IDCT8_ADD_SSE ds t_reg , r2 , r3 , r6
IDCT8_ADD_SSE ds t2q , r2 , r3 , r6
% if ARCH_X86_64 == 0
% if ARCH_X86_64 == 0
mov r1 , r1m
mov r1 , r1m
% endif
% endif
@ -595,7 +585,7 @@ h264_idct_add8_mmx_plane:
jz .skipblock
jz .skipblock
% if ARCH_X86_64
% if ARCH_X86_64
mov r0d , dword [ r1 + r5 * 4 ]
mov r0d , dword [ r1 + r5 * 4 ]
add r0 , [ r10 ]
add r0 , [ ds t2q ]
% else
% else
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , [ r0 ]
mov r0 , [ r0 ]
@ -611,20 +601,20 @@ h264_idct_add8_mmx_plane:
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_mmx , 5 , 7 , 0
cglobal h264_idct_add8_8_mmx , 5 , 8 + npicregs , 0 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
mov r5 , 16
mov r5 , 16
add r2 , 512
add r2 , 512
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
% if ARCH_X86_64
% if ARCH_X86_64
mov r10 , r0
mov ds t2q , r0
% endif
% endif
call h264_idct_add8_mmx_plane
call h264_idct_add8_mmx_plane
mov r5 , 32
mov r5 , 32
add r2 , 384
add r2 , 384
% if ARCH_X86_64
% if ARCH_X86_64
add r10 , gprsize
add ds t2q , gprsize
% else
% else
add r0mp , gprsize
add r0mp , gprsize
% endif
% endif
@ -639,7 +629,7 @@ h264_idct_add8_mmx2_plane
jz .try_dc
jz .try_dc
% if ARCH_X86_64
% if ARCH_X86_64
mov r0d , dword [ r1 + r5 * 4 ]
mov r0d , dword [ r1 + r5 * 4 ]
add r0 , [ r10 ]
add r0 , [ ds t2q ]
% else
% else
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , [ r0 ]
mov r0 , [ r0 ]
@ -658,7 +648,7 @@ h264_idct_add8_mmx2_plane
DC_ADD_MMX2_INIT r2 , r3 , r6
DC_ADD_MMX2_INIT r2 , r3 , r6
% if ARCH_X86_64
% if ARCH_X86_64
mov r0d , dword [ r1 + r5 * 4 ]
mov r0d , dword [ r1 + r5 * 4 ]
add r0 , [ r10 ]
add r0 , [ ds t2q ]
% else
% else
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , r1m ; XXX r1m here is actually r0m of the calling func
mov r0 , [ r0 ]
mov r0 , [ r0 ]
@ -674,20 +664,20 @@ h264_idct_add8_mmx2_plane
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_mmx2 , 5 , 7 , 0
cglobal h264_idct_add8_8_mmx2 , 5 , 8 + npicregs , 0 , ds t1 , bl ock_offset , bl ock , stride , nnzc , cntr , coeff , ds t2 , picreg
mov r5 , 16
mov r5 , 16
add r2 , 512
add r2 , 512
% if ARCH_X86_64
% if ARCH_X86_64
mov r10 , r0
mov ds t2q , r0
% endif
% endif
% ifdef PIC
% ifdef PIC
lea r11 , [ scan8_mem ]
lea picregq , [ scan8_mem ]
% endif
% endif
call h264_idct_add8_mmx2_plane
call h264_idct_add8_mmx2_plane
mov r5 , 32
mov r5 , 32
add r2 , 384
add r2 , 384
% if ARCH_X86_64
% if ARCH_X86_64
add r10 , gprsize
add ds t2q , gprsize
% else
% else
add r0mp , gprsize
add r0mp , gprsize
% endif
% endif
@ -739,7 +729,7 @@ x264_add8x4_idct_sse2:
jz .cycle % 1 end
jz .cycle % 1 end
mov r0d , dword [ r1 +% 1 * 8 ]
mov r0d , dword [ r1 +% 1 * 8 ]
% if ARCH_X86_64
% if ARCH_X86_64
add r0 , r10
add r0 , r5
% else
% else
add r0 , r0m
add r0 , r0m
% endif
% endif
@ -752,9 +742,9 @@ x264_add8x4_idct_sse2:
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_sse2 , 5 , 5 , 8
cglobal h264_idct_add16_8_sse2 , 5 , 5 + ARCH_X86_64 , 8
% if ARCH_X86_64
% if ARCH_X86_64
mov r10 , r0
mov r5 , r0
% endif
% endif
; unrolling of the loop leads to an average performance gain of
; unrolling of the loop leads to an average performance gain of
; 20-25%
; 20-25%
@ -774,7 +764,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
jz .try % 1 dc
jz .try % 1 dc
mov r0d , dword [ r1 +% 1 * 8 ]
mov r0d , dword [ r1 +% 1 * 8 ]
% if ARCH_X86_64
% if ARCH_X86_64
add r0 , r10
add r0 , r7
% else
% else
add r0 , r0m
add r0 , r0m
% endif
% endif
@ -786,7 +776,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
jz .cycle % 1 end
jz .cycle % 1 end
mov r0d , dword [ r1 +% 1 * 8 ]
mov r0d , dword [ r1 +% 1 * 8 ]
% if ARCH_X86_64
% if ARCH_X86_64
add r0 , r10
add r0 , r7
% else
% else
add r0 , r0m
add r0 , r0m
% endif
% endif
@ -799,9 +789,9 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_sse2 , 5 , 7 , 8
cglobal h264_idct_add16intra_8_sse2 , 5 , 7 + ARCH_X86_64 , 8
% if ARCH_X86_64
% if ARCH_X86_64
mov r10 , r0
mov r7 , r0
% endif
% endif
add16intra_sse2_cycle 0 , 0xc
add16intra_sse2_cycle 0 , 0xc
add16intra_sse2_cycle 1 , 0x14
add16intra_sse2_cycle 1 , 0x14
@ -819,7 +809,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
jz .try % 1 dc
jz .try % 1 dc
% if ARCH_X86_64
% if ARCH_X86_64
mov r0d , dword [ r1 + ( % 1 & 1 ) * 8 + 64 * ( 1 + ( % 1 >> 1 ))]
mov r0d , dword [ r1 + ( % 1 & 1 ) * 8 + 64 * ( 1 + ( % 1 >> 1 ))]
add r0 , [ r10 ]
add r0 , [ r7 ]
% else
% else
mov r0 , r0m
mov r0 , r0m
mov r0 , [ r0 ]
mov r0 , [ r0 ]
@ -833,7 +823,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
jz .cycle % 1 end
jz .cycle % 1 end
% if ARCH_X86_64
% if ARCH_X86_64
mov r0d , dword [ r1 + ( % 1 & 1 ) * 8 + 64 * ( 1 + ( % 1 >> 1 ))]
mov r0d , dword [ r1 + ( % 1 & 1 ) * 8 + 64 * ( 1 + ( % 1 >> 1 ))]
add r0 , [ r10 ]
add r0 , [ r7 ]
% else
% else
mov r0 , r0m
mov r0 , r0m
mov r0 , [ r0 ]
mov r0 , [ r0 ]
@ -850,15 +840,15 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_sse2 , 5 , 7 , 8
cglobal h264_idct_add8_8_sse2 , 5 , 7 + ARCH_X86_64 , 8
add r2 , 512
add r2 , 512
% if ARCH_X86_64
% if ARCH_X86_64
mov r10 , r0
mov r7 , r0
% endif
% endif
add8_sse2_cycle 0 , 0x34
add8_sse2_cycle 0 , 0x34
add8_sse2_cycle 1 , 0x3c
add8_sse2_cycle 1 , 0x3c
% if ARCH_X86_64
% if ARCH_X86_64
add r10 , gprsize
add r7 , gprsize
% else
% else
add r0mp , gprsize
add r0mp , gprsize
% endif
% endif