@ -44,14 +44,14 @@ sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 36 , 108
times 4 dw - 11 , 2
; 4-tap filter coefficients stored as signed bytes for pmaddubsw.
; Eight 16-byte rows; each row repeats one coefficient pair 8 times. The
; 4-tap routines index it as [table + (m << 4) - 16] and [table + (m << 4)],
; i.e. two consecutive rows per filter.
; In this copy the first row of each filter holds what appear to be the two
; outer taps and the second row the two inner taps (e.g. -6,-1 then 123,12).
; NOTE(review): the same label is defined a second time immediately below with
; a different pairing; only one definition can survive assembly -- confirm
; which layout the 4-tap routines in use expect.
fourtap_filter_hb_m: times 8 db - 6 , - 1
times 8 db 123 , 12
times 8 db - 9 , - 6
times 8 db 93 , 50
times 8 db - 6 , - 9
times 8 db 50 , 93
times 8 db - 1 , - 6
times 8 db 12 , 123
; 4-tap filter coefficients stored as signed bytes for pmaddubsw.
; Eight 16-byte rows; each row repeats one coefficient pair 8 times. The
; 4-tap routines index it as [table + (m << 4) - 16] and [table + (m << 4)],
; i.e. two consecutive rows per filter.
; In this copy the coefficients appear in sequential order: the first row of
; a filter holds taps (0,1) and the second row taps (2,3), e.g. -6,123 then
; 12,-1. This matches code that interleaves adjacent pixels/rows before the
; multiply.
; NOTE(review): the same label is also defined immediately above with a
; different pairing; only one definition can survive assembly.
fourtap_filter_hb_m: times 8 db - 6 , 123
times 8 db 12 , - 1
times 8 db - 9 , 93
times 8 db 50 , - 6
times 8 db - 6 , 50
times 8 db 93 , - 9
times 8 db - 1 , 12
times 8 db 123 , - 6
sixtap_filter_hb_m: times 8 db 2 , 1
times 8 db - 11 , 108
@ -136,7 +136,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
% endif
; pshufb byte-index masks used by the horizontal filters. Each mask gathers
; pairs of source bytes into adjacent positions so that one pmaddubsw applies
; two filter taps per output pixel.
; filter_h2_shuf: adjacent pairs starting at byte 0 -> (0,1) (1,2) (2,3) ...
filter_h2_shuf: db 0 , 1 , 1 , 2 , 2 , 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , 7 , 7 , 8
; filter_h4_shuf is defined twice in the visible text:
;   first copy : pairs three bytes apart    -> (0,3) (1,4) (2,5) ...
;   second copy: adjacent pairs from byte 2 -> (2,3) (3,4) (4,5) ...
; NOTE(review): only one definition can survive assembly; the choice has to
; agree with the coefficient pairing in fourtap_filter_hb_m -- confirm.
filter_h4_shuf: db 0 , 3 , 1 , 4 , 2 , 5 , 3 , 6 , 4 , 7 , 5 , 8 , 6 , 9 , 7 , 10
filter_h4_shuf: db 2 , 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , 7 , 7 , 8 , 8 , 9 , 9 , 10
; filter_h6_shuf1: pairs five bytes apart -> (0,5) (1,6) (2,7) ...
filter_h6_shuf1: db 0 , 5 , 1 , 6 , 2 , 7 , 3 , 8 , 4 , 9 , 5 , 10 , 6 , 11 , 7 , 12
; filter_h6_shuf2: adjacent pairs starting at byte 1 -> (1,2) (2,3) (3,4) ...
filter_h6_shuf2: db 1 , 2 , 2 , 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , 7 , 7 , 8 , 8 , 9
@ -159,6 +159,171 @@ SECTION .text
; int height, int mx, int my);
;-----------------------------------------------------------------------------
% macro FILTER_SSSE3 3
;-----------------------------------------------------------------------------
; put_vp8_epel%1_h6_ssse3: horizontal 6-tap filter, one output row per loop
; iteration.
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r5 = filter selector (int mx)
; Per row: three pmaddubsw products (two taps each) are summed with saturation,
; [pw_64] is added, the words are shifted right arithmetically by 7 and packed
; to unsigned bytes with saturation.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea       r5d, [r5*3]                     ; with the *8 below: 24 * mx
    mova       m3, [filter_h6_shuf2]
    mova       m4, [filter_h6_shuf3]
%ifdef PIC
    ; presumably sixtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [sixtap_filter_hb_m]
%endif
    mova       m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova       m6, [sixtap_filter_hb+r5*8-32]
    mova       m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu       m0, [r2-2]
    mova       m1, m0
    mova       m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw  m0, [r2+3]
%else
    pshufb     m0, [filter_h6_shuf1]
%endif
    pshufb     m1, m3
    pshufb     m2, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    paddsw     m0, m1
    paddsw     m0, m2
    paddsw     m0, [pw_64]
    psraw      m0, 7
    packuswb   m0, m0
    movh     [r0], m0                         ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel%1_h4_ssse3: horizontal 4-tap filter, one output row per loop
; iteration.
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r5 = filter selector (int mx)
; The source row is loaded from r2-1 and shuffled twice: filter_h2_shuf feeds
; the product with the first coefficient row, filter_h4_shuf the product with
; the second. The sum gets [pw_64] added, is shifted right arithmetically by 7
; and packed to unsigned bytes with saturation.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl       r5d, 4                          ; 16 bytes per coefficient row
    mova       m2, [pw_64]
    mova       m3, [filter_h2_shuf]
    mova       m4, [filter_h4_shuf]
%ifdef PIC
    ; presumably fourtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [fourtap_filter_hb_m]
%endif
    mova       m5, [fourtap_filter_hb+r5-16]  ; set up 4tap filter in bytes
    mova       m6, [fourtap_filter_hb+r5]

.nextrow:
    movu       m0, [r2-1]
    mova       m1, m0
    pshufb     m0, m3
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m6
    paddsw     m0, m2
    paddsw     m0, m1
    psraw      m0, 7
    packuswb   m0, m0
    movh     [r0], m0                         ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel%1_v4_ssse3: vertical 4-tap filter, one output row per loop
; iteration.
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r6 = filter selector (int my)
; m0/m1/m2 carry the three most recent source rows between iterations; each
; iteration loads one new row into m3. Rows are interleaved as (m0,m1) and
; (m2,m3) before pmaddubsw, so the coefficient rows must hold taps in
; sequential pairs.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl       r6d, 4                          ; 16 bytes per coefficient row
%ifdef PIC
    ; presumably fourtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [fourtap_filter_hb_m]
%endif
    mova       m5, [fourtap_filter_hb+r6-16]
    mova       m6, [fourtap_filter_hb+r6]
    mova       m7, [pw_64]

    ; read 3 lines
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+  r3]
    movh       m2, [r2+2*r3]
    add        r2, r3                         ; r2 is back at its entry value

.nextrow:
    movh       m3, [r2+2*r3]                  ; read new row
    mova       m4, m0
    mova       m0, m1                         ; rotate row history
    punpcklbw  m4, m1
    mova       m1, m2
    punpcklbw  m2, m3
    pmaddubsw  m4, m5
    pmaddubsw  m2, m6
    paddsw     m4, m2
    mova       m2, m3
    paddsw     m4, m7
    psraw      m4, 7
    packuswb   m4, m4
    movh     [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel%1_v6_ssse3: vertical 6-tap filter, one output row per loop
; iteration.
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height)
;   r6 = filter selector (int my) on entry, then pointer into the 6-tap table
; m0..m4 carry the five most recent source rows between iterations; each
; iteration loads one new row into m5. Rows are interleaved as (m0,m5),
; (m1,m2) and (m3,m4) and multiplied by the coefficient rows at r6-48, r6-32
; and r6-16.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea       r6d, [r6*3]                     ; with the *8 below: 24 * my
%ifdef PIC
    ; presumably sixtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [sixtap_filter_hb_m]
%endif
    lea        r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub        r2, r3
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+r3]
    movh       m2, [r2+r3*2]
    lea        r2, [r2+r3*2]
    add        r2, r3                         ; r2 = entry value + one stride
    movh       m3, [r2]
    movh       m4, [r2+r3]

.nextrow:
    movh       m5, [r2+2*r3]                  ; read new row
    mova       m6, m0
    punpcklbw  m6, m5
    mova       m0, m1                         ; rotate row history
    punpcklbw  m1, m2
    mova       m7, m3
    punpcklbw  m7, m4
    pmaddubsw  m6, [r6-48]
    pmaddubsw  m1, [r6-32]
    pmaddubsw  m7, [r6-16]
    paddsw     m6, m1
    paddsw     m6, m7
    mova       m1, m2
    paddsw     m6, [pw_64]
    mova       m2, m3
    psraw      m6, 7
    mova       m3, m4
    packuswb   m6, m6
    mova       m4, m5
    movh     [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
% endmacro
; Instantiate the SSSE3 epel filters twice, once after INIT_MMX and once after
; INIT_XMM. FILTER_SSSE3 arguments:
;   %1 = size suffix pasted into the function names (put_vp8_epel%1_...); the
;        value 4 also selects the punpcklbw path in the h6 routine
;   %2 = fourth cglobal argument for the h6, v4 and v6 routines
;   %3 = fourth cglobal argument for the h4 routine
INIT_MMX
FILTER_SSSE3 4 , 0 , 0
INIT_XMM
FILTER_SSSE3 8 , 8 , 7
; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext , 6 , 6
shl r5d , 4
@ -383,72 +548,6 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
jg .nextrow
REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel8_h4_ssse3: horizontal 4-tap filter, one output row per loop
; iteration (standalone, non-macro version).
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r5 = filter selector (int mx)
; This version shuffles with filter_h4_shuf for the first coefficient row and
; filter_h6_shuf2 (adjacent pairs from byte 1) for the second, so it needs the
; filter_h4_shuf mask that pairs bytes three apart and a coefficient table
; laid out as outer-tap row / inner-tap row.
; NOTE(review): "FILTER_SSSE3 8, 8, 7" expands to a function with this same
; name; both cannot be assembled into one object -- confirm which is intended.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl       r5d, 4                          ; 16 bytes per coefficient row
    mova       m2, [pw_64]
    mova       m3, [filter_h4_shuf]
    mova       m4, [filter_h6_shuf2]
%ifdef PIC
    ; presumably fourtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [fourtap_filter_hb_m]
%endif
    mova       m5, [fourtap_filter_hb+r5-16]  ; set up 4tap filter in bytes
    mova       m6, [fourtap_filter_hb+r5]

.nextrow:
    movu       m0, [r2-1]
    mova       m1, m0
    pshufb     m0, m3
    pshufb     m1, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m6
    paddsw     m0, m2
    paddsw     m0, m1
    psraw      m0, 7
    packuswb   m0, m0
    movh     [r0], m0                         ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel8_h6_ssse3: horizontal 6-tap filter, one output row per loop
; iteration (standalone, non-macro version).
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r5 = filter selector (int mx)
; filter_h6_shuf1 and filter_h6_shuf2 are kept in m3/m4; filter_h6_shuf3 is
; applied from memory on every row.
; NOTE(review): "FILTER_SSSE3 8, 8, 7" expands to a function with this same
; name; both cannot be assembled into one object -- confirm which is intended.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea       r5d, [r5*3]                     ; with the *8 below: 24 * mx
    mova       m3, [filter_h6_shuf1]
    mova       m4, [filter_h6_shuf2]
%ifdef PIC
    ; presumably sixtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [sixtap_filter_hb_m]
%endif
    mova       m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova       m6, [sixtap_filter_hb+r5*8-32]
    mova       m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu       m0, [r2-2]
    mova       m1, m0
    mova       m2, m0
    pshufb     m0, m3
    pshufb     m1, m4
    pshufb     m2, [filter_h6_shuf3]
    pmaddubsw  m0, m5
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    paddsw     m0, m1
    paddsw     m0, m2
    paddsw     m0, [pw_64]
    psraw      m0, 7
    packuswb   m0, m0
    movh     [r0], m0                         ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
% macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel % 2 _v4_ % 1 , 7 , 7 , % 3
@ -573,92 +672,6 @@ FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V ss e2 , 8 , 8
;-----------------------------------------------------------------------------
; put_vp8_epel8_v4_ssse3: vertical 4-tap filter, one output row per loop
; iteration (standalone, non-macro version).
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height),      r6 = filter selector (int my)
; m0/m1/m2 carry the three most recent source rows between iterations; each
; iteration loads one new row into m3. Rows are interleaved as (m0,m3) and
; (m1,m2) before pmaddubsw, so the coefficient table must hold an outer-tap
; row followed by an inner-tap row.
; NOTE(review): "FILTER_SSSE3 8, 8, 7" expands to a function with this same
; name; both cannot be assembled into one object -- confirm which is intended.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl       r6d, 4                          ; 16 bytes per coefficient row
%ifdef PIC
    ; presumably fourtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [fourtap_filter_hb_m]
%endif
    mova       m5, [fourtap_filter_hb+r6-16]
    mova       m6, [fourtap_filter_hb+r6]
    mova       m7, [pw_64]

    ; read 3 lines
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+  r3]
    movh       m2, [r2+2*r3]
    add        r2, r3                         ; r2 is back at its entry value

.nextrow:
    movh       m3, [r2+2*r3]                  ; read new row
    mova       m4, m0
    mova       m0, m1                         ; rotate row history
    punpcklbw  m4, m3
    punpcklbw  m1, m2
    pmaddubsw  m4, m5
    pmaddubsw  m1, m6
    paddsw     m4, m1
    mova       m1, m2
    paddsw     m4, m7
    mova       m2, m3
    psraw      m4, 7
    packuswb   m4, m4
    movh     [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; put_vp8_epel8_v6_ssse3: vertical 6-tap filter, one output row per loop
; iteration (standalone, non-macro version).
; Register roles as used below:
;   r0 = destination pointer (written), r1 = added to r0 after each row
;   r2 = source pointer (read),         r3 = added to r2 after each row
;   r4 = row counter (int height)
;   r6 = filter selector (int my) on entry, then pointer into the 6-tap table
; m0..m4 carry the five most recent source rows between iterations; each
; iteration loads one new row into m5. Rows are interleaved as (m0,m5),
; (m1,m2) and (m3,m4) and multiplied by the coefficient rows at r6-48, r6-32
; and r6-16.
; NOTE(review): "FILTER_SSSE3 8, 8, 7" expands to a function with this same
; name; both cannot be assembled into one object -- confirm which is intended.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea       r6d, [r6*3]                     ; with the *8 below: 24 * my
%ifdef PIC
    ; presumably sixtap_filter_hb resolves to r11 in PIC builds -- defined
    ; outside this excerpt
    lea       r11, [sixtap_filter_hb_m]
%endif
    lea        r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub        r2, r3
    sub        r2, r3
    movh       m0, [r2]
    movh       m1, [r2+r3]
    movh       m2, [r2+r3*2]
    lea        r2, [r2+r3*2]
    add        r2, r3                         ; r2 = entry value + one stride
    movh       m3, [r2]
    movh       m4, [r2+r3]

.nextrow:
    movh       m5, [r2+2*r3]                  ; read new row
    mova       m6, m0
    punpcklbw  m6, m5
    mova       m0, m1                         ; rotate row history
    punpcklbw  m1, m2
    mova       m7, m3
    punpcklbw  m7, m4
    pmaddubsw  m6, [r6-48]
    pmaddubsw  m1, [r6-32]
    pmaddubsw  m7, [r6-16]
    paddsw     m6, m1
    paddsw     m6, m7
    mova       m1, m2
    paddsw     m6, [pw_64]
    mova       m2, m3
    psraw      m6, 7
    mova       m3, m4
    packuswb   m6, m6
    mova       m4, m5
    movh     [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a 32-bit int: only the low dword of r4 is guaranteed valid
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
% macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear % 2 _v_ % 1 , 7 , 7 , % 3
mov r5d , 8 * 16