@ -438,48 +438,43 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow
REP_RET
; 8-pixel-wide block, H-only 4-tap filter
INIT_XMM
; put_vp8_epel8_h4_sse2: horizontal 4-tap filter; each loop iteration reads
; source bytes around r2 and stores one filtered row through r0.
;
; NOTE(review): this span looks like a unified diff whose +/- markers were
; lost. Two cglobal lines, two coefficient-table setups (fourtap_filter_hw
; and fourtap_filter_v) and two complete per-row bodies appear back to back,
; and a hunk header sits just before the loop branch. The comments below
; label the two variants ("pmaddwd variant" / "pmullw variant"); confirm
; against the real file which lines are live before assembling.
;
; Registers as used in the visible code:
;   r0  - store address for the filtered row
;   r2  - load address of the current source row
;   r5  - on entry an index that is scaled into a coefficient-table offset
;   m7  - all zeroes (byte->word unpack source, upper half for packuswb)
; pmaddwd variant: declared with 8 as the last cglobal argument
; (presumably the XMM register count per x86inc convention - TODO confirm).
cglobal put_vp8_epel8_h4_sse2 , 6 , 6 , 8
; pmaddwd variant: r5 = r5 * 16, byte offset into fourtap_filter_hw.
shl r5d , 4
; pmullw variant: last cglobal argument raised to 10, matching its use of
; m8 and m9 further down.
cglobal put_vp8_epel8_h4_sse2 , 6 , 6 , 10
; pmullw variant: r5 = r5 * 32, byte offset into fourtap_filter_v.
shl r5d , 5
; Under PIC the table address is first materialised in r11. Presumably the
; bare table name used below is a macro that resolves through r11; its
; definition is not visible in this span.
% ifdef PIC
lea r11 , [ fourtap_filter_hw _m ]
lea r11 , [ fourtap_filter_v _m ]
% endif
; pmaddwd variant: m5/m6 = two 16-byte coefficient vectors for this index.
mova m5 , [ fourtap_filter_hw + r5 - 16 ] ; set up 4tap filter in words
mova m6 , [ fourtap_filter_hw + r5 ]
; pmullw variant: r5 becomes a pointer to four consecutive 16-byte
; coefficient vectors (read at +0, +16, +32, +48 below).
lea r5 , [ fourtap_filter_v + r5 - 32 ]
; m7 = 0 for the whole loop.
pxor m7 , m7
; pmullw variant: m4 = rounding term added before the >>7 (pw_64 is
; presumably 64 in every word - TODO confirm against its definition).
mova m4 , [ pw_64 ]
; pmullw variant: coefficient vectors for the first two taps.
mova m5 , [ r5 + 0 ]
mova m6 , [ r5 + 16 ]
; When m8 is defined (presumably only where 16 XMM registers exist, i.e.
; x86-64 - TODO confirm) the last two coefficient vectors are cached in
; registers; otherwise the loop reads them from [r5+32] / [r5+48].
% ifdef m8
mova m8 , [ r5 + 32 ]
mova m9 , [ r5 + 48 ]
% endif
.nextrow
; ---- pmaddwd variant of the row body --------------------------------
; First half: load 8 bytes starting one pixel left of r2 and widen to words.
movh m0 , [ r2 - 1 ]
punpcklbw m0 , m7 ; ABCDEFGH
mova m1 , m0
mova m2 , m0
mova m3 , m0
; Copies shifted by 1, 2 and 3 words so neighbouring pixels can be paired.
psrldq m1 , 2 ; BCDEFGH
psrldq m2 , 4 ; CDEFGH
psrldq m3 , 6 ; DEFGH
; Interleave adjacent pixels; each dword then holds the two inputs that
; pmaddwd multiplies by a coefficient pair and sums.
punpcklwd m0 , m1 ; ABBCCDDE
punpcklwd m2 , m3 ; CDDEEFFG
pmaddwd m0 , m5
pmaddwd m2 , m6
; m0 = four 32-bit sums of all four products, one per output position.
paddd m0 , m2
; Second half: same sequence on the 8 bytes starting at r2+3.
movh m1 , [ r2 + 3 ]
punpcklbw m1 , m7 ; ABCDEFGH
mova m2 , m1
mova m3 , m1
; NOTE(review): m4 is used as scratch here, which clobbers the pw_64 value
; that the pmullw variant loads before the loop and adds at its paddsw -
; more evidence that two versions are interleaved in this span.
mova m4 , m1
psrldq m2 , 2 ; BCDEFGH
psrldq m3 , 4 ; CDEFGH
psrldq m4 , 6 ; DEFGH
punpcklwd m1 , m2 ; ABBCCDDE
punpcklwd m3 , m4 ; CDDEEFFG
pmaddwd m1 , m5
pmaddwd m3 , m6
paddd m1 , m3
; Narrow both halves to eight signed words and add the rounding term.
packssdw m0 , m1
paddsw m0 , [ pw_64 ]
; ---- pmullw variant of the row body ---------------------------------
; NOTE(review): read linearly, the next load overwrites the m0 computed
; above, so only one of the two bodies can be the live one.
; Load four 8-byte windows at offsets -1, 0, +1, +2 from r2, one per tap.
movq m0 , [ r2 - 1 ]
movq m1 , [ r2 - 0 ]
movq m2 , [ r2 + 1 ]
movq m3 , [ r2 + 2 ]
; Zero-extend every window from bytes to words (m7 is zero).
punpcklbw m0 , m7
punpcklbw m1 , m7
punpcklbw m2 , m7
punpcklbw m3 , m7
; Multiply each window by its coefficient vector (low 16 bits kept).
pmullw m0 , m5
pmullw m1 , m6
% ifdef m8
pmullw m2 , m8
pmullw m3 , m9
% else
pmullw m2 , [ r5 + 32 ]
pmullw m3 , [ r5 + 48 ]
% endif
; Sum the four products with signed saturation, then add the rounding term.
paddsw m0 , m1
paddsw m2 , m3
paddsw m0 , m2
paddsw m0 , m4
; ---- tail shared by both variants -----------------------------------
; Arithmetic shift right by 7, then pack to unsigned bytes with saturation.
psraw m0 , 7
packuswb m0 , m7
movh [ r0 ], m0 ; store
; NOTE(review): the hunk header below marks elided lines; whatever advances
; r0/r2 and sets the flags tested by jg is not visible in this span.
@ -491,62 +486,57 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
jg .nextrow
REP_RET
cglobal put_vp8_epel8_h6_sse2 , 6 , 6 , 8
cglobal put_vp8_epel8_h6_sse2 , 6 , 6 , 14
lea r5d , [ r5 * 3 ]
shl r5d , 4
% ifdef PIC
lea r11 , [ si xtap_filter_hw _m ]
lea r11 , [ si xtap_filter_v _m ]
% endif
lea r5 , [ si xtap_filter_hw + r5 * 8 ]
lea r5 , [ si xtap_filter_v + r5 - 96 ]
pxor m7 , m7
mova m6 , [ pw_64 ]
% ifdef m8
mova m8 , [ r5 + 0 ]
mova m9 , [ r5 + 16 ]
mova m10 , [ r5 + 32 ]
mova m11 , [ r5 + 48 ]
mova m12 , [ r5 + 64 ]
mova m13 , [ r5 + 80 ]
% endif
.nextrow
movu m0 , [ r2 - 2 ]
mova m6 , m0
mova m4 , m0
punpcklbw m0 , m7 ; ABCDEFGHI
mova m1 , m0
mova m2 , m0
mova m3 , m0
psrldq m1 , 2 ; BCDEFGH
psrldq m2 , 4 ; CDEFGH
psrldq m3 , 6 ; DEFGH
psrldq m4 , 4
punpcklbw m4 , m7 ; EFGH
mova m5 , m4
psrldq m5 , 2 ; FGH
punpcklwd m0 , m1 ; ABBCCDDE
punpcklwd m2 , m3 ; CDDEEFFG
punpcklwd m4 , m5 ; EFFGGHHI
pmaddwd m0 , [ r5 - 48 ]
pmaddwd m2 , [ r5 - 32 ]
pmaddwd m4 , [ r5 - 16 ]
paddd m0 , m2
paddd m0 , m4
psrldq m6 , 4
mova m4 , m6
punpcklbw m6 , m7 ; ABCDEFGHI
mova m1 , m6
mova m2 , m6
mova m3 , m6
psrldq m1 , 2 ; BCDEFGH
psrldq m2 , 4 ; CDEFGH
psrldq m3 , 6 ; DEFGH
psrldq m4 , 4
punpcklbw m4 , m7 ; EFGH
mova m5 , m4
psrldq m5 , 2 ; FGH
punpcklwd m6 , m1 ; ABBCCDDE
punpcklwd m2 , m3 ; CDDEEFFG
punpcklwd m4 , m5 ; EFFGGHHI
pmaddwd m6 , [ r5 - 48 ]
pmaddwd m2 , [ r5 - 32 ]
pmaddwd m4 , [ r5 - 16 ]
paddd m6 , m2
paddd m6 , m4
packssdw m0 , m6
paddsw m0 , [ pw_64 ]
movq m0 , [ r2 - 2 ]
movq m1 , [ r2 - 1 ]
movq m2 , [ r2 - 0 ]
movq m3 , [ r2 + 1 ]
movq m4 , [ r2 + 2 ]
movq m5 , [ r2 + 3 ]
punpcklbw m0 , m7
punpcklbw m1 , m7
punpcklbw m2 , m7
punpcklbw m3 , m7
punpcklbw m4 , m7
punpcklbw m5 , m7
% ifdef m8
pmullw m0 , m8
pmullw m1 , m9
pmullw m2 , m10
pmullw m3 , m11
pmullw m4 , m12
pmullw m5 , m13
% else
pmullw m0 , [ r5 + 0 ]
pmullw m1 , [ r5 + 16 ]
pmullw m2 , [ r5 + 32 ]
pmullw m3 , [ r5 + 48 ]
pmullw m4 , [ r5 + 64 ]
pmullw m5 , [ r5 + 80 ]
% endif
paddsw m1 , m4
paddsw m0 , m5
paddsw m1 , m2
paddsw m0 , m3
paddsw m0 , m1
paddsw m0 , m6
psraw m0 , 7
packuswb m0 , m7
movh [ r0 ], m0 ; store