@@ -48,22 +48,6 @@ cextern pw_8
; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
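; Vertical prediction: the row of pixels directly above the block is copied
; into all 16 rows of the 16x16 block.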
INIT_MMX mmx
cglobal pred16x16_vertical_8 , 2 , 3
sub r0 , r1
mov r2 , 8
movq mm0 , [ r0 + 0 ]
movq mm1 , [ r0 + 8 ]
.loop:
movq [ r0 + r1 * 1 + 0 ], mm0
movq [ r0 + r1 * 1 + 8 ], mm1
movq [ r0 + r1 * 2 + 0 ], mm0
movq [ r0 + r1 * 2 + 8 ], mm1
lea r0 , [ r0 + r1 * 2 ]
dec r2
jg .loop
REP_RET
INIT_XMM sse
cglobal pred16x16_vertical_8 , 2 , 3
sub r0 , r1
@@ -114,8 +98,6 @@ cglobal pred16x16_horizontal_8, 2,3
REP_RET
%endmacro
INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
@@ -154,14 +136,6 @@ cglobal pred16x16_dc_8, 2,7
%endif
SPLATB_REG m0 , r2 , m1
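; m0 now holds the DC value replicated into every byte; the loop below
; fills all 16 rows of the block with it.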
%if mmsize==8
mov r3d , 8
.loop:
mova [ r4 + r1 * 0 + 0 ], m0
mova [ r4 + r1 * 0 + 8 ], m0
mova [ r4 + r1 * 1 + 0 ], m0
mova [ r4 + r1 * 1 + 8 ], m0
%else
mov r3d , 4
.loop:
mova [ r4 + r1 * 0 ], m0
@@ -169,15 +143,12 @@ cglobal pred16x16_dc_8, 2,7
lea r4 , [ r4 + r1 * 2 ]
mova [ r4 + r1 * 0 ], m0
mova [ r4 + r1 * 1 ], m0
%endif
lea r4 , [ r4 + r1 * 2 ]
dec r3d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
@@ -187,47 +158,6 @@ PRED16x16_DC
; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
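; TrueMotion prediction (VP8): each output pixel is top[x] + left[y] - topleft,
; computed on 16-bit words and packed back to bytes with unsigned saturation.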
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8 , 2 , 5
sub r0 , r1
pxor mm7 , mm7
movq mm0 , [ r0 + 0 ]
movq mm2 , [ r0 + 8 ]
movq mm1 , mm0
movq mm3 , mm2
punpcklbw mm0 , mm7
punpckhbw mm1 , mm7
punpcklbw mm2 , mm7
punpckhbw mm3 , mm7
movzx r3d , byte [ r0 - 1 ]
mov r4d , 16
.loop:
movzx r2d , byte [ r0 + r1 - 1 ]
sub r2d , r3d
movd mm4 , r2d
SPLATW mm4 , mm4 , 0
movq mm5 , mm4
movq mm6 , mm4
movq mm7 , mm4
paddw mm4 , mm0
paddw mm5 , mm1
paddw mm6 , mm2
paddw mm7 , mm3
packuswb mm4 , mm5
packuswb mm6 , mm7
movq [ r0 + r1 + 0 ], mm4
movq [ r0 + r1 + 8 ], mm6
add r0 , r1
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM
INIT_XMM sse2
cglobal pred16x16_tm_vp8_8 , 2 , 6 , 6
sub r0 , r1
@@ -311,22 +241,6 @@ cglobal pred16x16_plane_%1_8, 2,9,7
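; Plane prediction: the block is filled with a linear gradient a + b*x + c*y
; derived from the top row and left column; the code below accumulates the
; horizontal gradient (H) as a weighted sum of top-row pixel differences.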
neg r1 ; -stride
movh m0 , [ r0 + r1 - 1 ]
%if mmsize == 8
pxor m4 , m4
movh m1 , [ r0 + r1 + 3 ]
movh m2 , [ r0 + r1 + 8 ]
movh m3 , [ r0 + r1 + 12 ]
punpcklbw m0 , m4
punpcklbw m1 , m4
punpcklbw m2 , m4
punpcklbw m3 , m4
pmullw m0 , [ pw_m8tom1 ]
pmullw m1 , [ pw_m8tom1 + 8 ]
pmullw m2 , [ pw_1to8 ]
pmullw m3 , [ pw_1to8 + 8 ]
paddw m0 , m2
paddw m1 , m3
%else ; mmsize == 16
%if cpuflag(ssse3)
movhps m0 , [ r0 + r1 + 8 ]
pmaddubsw m0 , [ plane_shuf ] ; H coefficients
@@ -340,21 +254,10 @@ cglobal pred16x16_plane_%1_8, 2,9,7
paddw m0 , m1
%endif
movhlps m1 , m0
%endif
paddw m0 , m1
%if cpuflag(mmxext)
PSHUFLW m1 , m0 , 0xE
%elif cpuflag(mmx)
mova m1 , m0
psrlq m1 , 32
%endif
paddw m0 , m1
%if cpuflag(mmxext)
PSHUFLW m1 , m0 , 0x1
%elif cpuflag(mmx)
mova m1 , m0
psrlq m1 , 16
%endif
paddw m0 , m1 ; sum of H coefficients
lea r4 , [ r0 + r2 * 8 - 1 ]
@@ -496,24 +399,10 @@ cglobal pred16x16_plane_%1_8, 2,9,7
SWAP 0 , 1
%endif
mova m2 , m0
%if mmsize == 8
mova m5 , m0
%endif
pmullw m0 , [ pw_0to7 ] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
psllw m2 , 3
%else
psllw m5 , 3
psllw m2 , 2
mova m6 , m5
paddw m6 , m2
%endif
paddw m0 , m3 ; a + {0,1,2,3,4,5,6,7}*H
paddw m2 , m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
paddw m5 , m0 ; a + {8,9,10,11}*H
paddw m6 , m0 ; a + {12,13,14,15}*H
%endif
mov r4 , 8
.loop:
@@ -523,20 +412,8 @@ cglobal pred16x16_plane_%1_8, 2,9,7
psraw m4 , 5
packuswb m3 , m4
mova [ r0 ], m3
%if mmsize == 8
mova m3 , m5 ; b[8..11]
mova m4 , m6 ; b[12..15]
psraw m3 , 5
psraw m4 , 5
packuswb m3 , m4
mova [ r0 + 8 ], m3
%endif
paddw m0 , m1
paddw m2 , m1
%if mmsize == 8
paddw m5 , m1
paddw m6 , m1
%endif
mova m3 , m0 ; b[0..7]
mova m4 , m2 ; b[8..15]
@@ -544,20 +421,8 @@ cglobal pred16x16_plane_%1_8, 2,9,7
psraw m4 , 5
packuswb m3 , m4
mova [ r0 + r2 ], m3
%if mmsize == 8
mova m3 , m5 ; b[8..11]
mova m4 , m6 ; b[12..15]
psraw m3 , 5
psraw m4 , 5
packuswb m3 , m4
mova [ r0 + r2 + 8 ], m3
%endif
paddw m0 , m1
paddw m2 , m1
%if mmsize == 8
paddw m5 , m1
paddw m6 , m1
%endif
lea r0 , [ r0 + r2 * 2 ]
dec r4
@@ -565,14 +430,6 @@ cglobal pred16x16_plane_%1_8, 2,9,7
REP_RET
%endmacro
INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
@@ -592,14 +449,6 @@ cglobal pred8x8_plane_8, 2,9,7
neg r1 ; -stride
movd m0 , [ r0 + r1 - 1 ]
%if mmsize == 8
pxor m2 , m2
movh m1 , [ r0 + r1 + 4 ]
punpcklbw m0 , m2
punpcklbw m1 , m2
pmullw m0 , [ pw_m4to4 ]
pmullw m1 , [ pw_m4to4 + 8 ]
%else ; mmsize == 16
%if cpuflag(ssse3)
movhps m0 , [ r0 + r1 + 4 ] ; this reads 4 bytes more than necessary
pmaddubsw m0 , [ plane8_shuf ] ; H coefficients
@@ -611,25 +460,14 @@ cglobal pred8x8_plane_8, 2,9,7
pmullw m0 , [ pw_m4to4 ]
%endif
movhlps m1 , m0
%endif
paddw m0 , m1
%if notcpuflag(ssse3)
%if cpuflag(mmxext)
PSHUFLW m1 , m0 , 0xE
%elif cpuflag(mmx)
mova m1 , m0
psrlq m1 , 32
%endif
paddw m0 , m1
%endif ; !ssse3
%if cpuflag(mmxext)
PSHUFLW m1 , m0 , 0x1
%elif cpuflag(mmx)
mova m1 , m0
psrlq m1 , 16
%endif
paddw m0 , m1 ; sum of H coefficients
lea r4 , [ r0 + r2 * 4 - 1 ]
@@ -699,20 +537,12 @@ cglobal pred8x8_plane_8, 2,9,7
SPLATW m0 , m0 , 0 ; H
SPLATW m1 , m1 , 0 ; V
SPLATW m3 , m3 , 0 ; a
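; m0 = H, m1 = V, m3 = a, each broadcast to all words; each row below is
; b[x] = a + x*H, incremented by V per row, shifted right by 5 and packed.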
%if mmsize == 8
mova m2 , m0
%endif
pmullw m0 , [ pw_0to7 ] ; 0*H, 1*H, ..., 7*H (words)
paddw m0 , m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
psllw m2 , 2
paddw m2 , m0 ; a + {4,5,6,7}*H
%endif
mov r4 , 4
ALIGN 16
.loop:
%if mmsize == 16
mova m3 , m0 ; b[0..7]
paddw m0 , m1
psraw m3 , 5
@@ -722,24 +552,6 @@ ALIGN 16
packuswb m3 , m4
movh [ r0 ], m3
movhps [ r0 + r2 ], m3
%else ; mmsize == 8
mova m3 , m0 ; b[0..3]
mova m4 , m2 ; b[4..7]
paddw m0 , m1
paddw m2 , m1
psraw m3 , 5
psraw m4 , 5
mova m5 , m0 ; V+b[0..3]
mova m6 , m2 ; V+b[4..7]
paddw m0 , m1
paddw m2 , m1
psraw m5 , 5
psraw m6 , 5
packuswb m3 , m4
packuswb m5 , m6
mova [ r0 ], m3
mova [ r0 + r2 ], m5
%endif
lea r0 , [ r0 + r2 * 2 ]
dec r4
@@ -747,10 +559,6 @@ ALIGN 16
REP_RET
%endmacro
INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
@@ -794,8 +602,6 @@ cglobal pred8x8_horizontal_8, 2,3
REP_RET
%endmacro
INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
@@ -937,46 +743,6 @@ cglobal pred8x8_dc_rv40_8, 2,7
; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
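; Same TrueMotion scheme as the 16x16 version, applied to an 8x8 block:
; top[x] plus (left[y] - topleft), with unsigned saturation on packing.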
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8 , 2 , 6
sub r0 , r1
pxor mm7 , mm7
movq mm0 , [ r0 ]
movq mm1 , mm0
punpcklbw mm0 , mm7
punpckhbw mm1 , mm7
movzx r4d , byte [ r0 - 1 ]
mov r5d , 4
.loop:
movzx r2d , byte [ r0 + r1 * 1 - 1 ]
movzx r3d , byte [ r0 + r1 * 2 - 1 ]
sub r2d , r4d
sub r3d , r4d
movd mm2 , r2d
movd mm4 , r3d
SPLATW mm2 , mm2 , 0
SPLATW mm4 , mm4 , 0
movq mm3 , mm2
movq mm5 , mm4
paddw mm2 , mm0
paddw mm3 , mm1
paddw mm4 , mm0
paddw mm5 , mm1
packuswb mm2 , mm3
packuswb mm4 , mm5
movq [ r0 + r1 * 1 ], mm2
movq [ r0 + r1 * 2 ], mm4
lea r0 , [ r0 + r1 * 2 ]
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM
INIT_XMM sse2
cglobal pred8x8_tm_vp8_8 , 2 , 6 , 4
sub r0 , r1
@@ -1333,114 +1099,6 @@ PRED8x8L_VERTICAL
; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
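; Down-left prediction: pixels are extrapolated along the 45-degree down-left
; diagonal from a lowpass-filtered copy of the top and top-right neighbours;
; the .fix_lt_*/.fix_tr_* paths patch the edge samples when has_topleft or
; has_topright is zero.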
INIT_MMX mmxext
cglobal pred8x8l_down_left_8 , 4 , 5
sub r0 , r3
movq mm0 , [ r0 - 8 ]
movq mm3 , [ r0 ]
movq mm1 , [ r0 + 8 ]
movq mm2 , mm3
movq mm4 , mm3
PALIGNR mm2 , mm0 , 7 , mm0
PALIGNR mm1 , mm4 , 1 , mm4
test r1d , r1d
jz .fix_lt_2
test r2d , r2d
jz .fix_tr_1
jmp .do_top
.fix_lt_2:
movq mm5 , mm3
pxor mm5 , mm2
psllq mm5 , 56
psrlq mm5 , 56
pxor mm2 , mm5
test r2d , r2d
jnz .do_top
.fix_tr_1:
movq mm5 , mm3
pxor mm5 , mm1
psrlq mm5 , 56
psllq mm5 , 56
pxor mm1 , mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3 , mm3
pshufw mm1 , mm3 , 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4 , mm2 , mm1 , mm3 , mm5
movq mm7 , mm4
test r2d , r2d
jz .fix_tr_2
movq mm0 , [ r0 + 8 ]
movq mm5 , mm0
movq mm2 , mm0
movq mm4 , mm0
psrlq mm5 , 56
PALIGNR mm2 , mm3 , 7 , mm3
PALIGNR mm5 , mm4 , 1 , mm4
PRED4x4_LOWPASS mm1 , mm2 , mm5 , mm0 , mm4
.do_topright:
lea r1 , [ r0 + r3 * 2 ]
movq mm6 , mm1
psrlq mm1 , 56
movq mm4 , mm1
lea r2 , [ r1 + r3 * 2 ]
movq mm2 , mm6
PALIGNR mm2 , mm7 , 1 , mm0
movq mm3 , mm6
PALIGNR mm3 , mm7 , 7 , mm0
PALIGNR mm4 , mm6 , 1 , mm0
movq mm5 , mm7
movq mm1 , mm7
movq mm7 , mm6
lea r4 , [ r2 + r3 * 2 ]
psllq mm1 , 8
PRED4x4_LOWPASS mm0 , mm1 , mm2 , mm5 , mm6
PRED4x4_LOWPASS mm1 , mm3 , mm4 , mm7 , mm6
movq [ r4 + r3 * 2 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r4 + r3 * 1 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r2 + r3 * 2 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r2 + r3 * 1 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r1 + r3 * 2 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r1 + r3 * 1 ], mm1
movq mm2 , mm0
psllq mm1 , 8
psrlq mm2 , 56
psllq mm0 , 8
por mm1 , mm2
movq [ r0 + r3 * 2 ], mm1
psllq mm1 , 8
psrlq mm0 , 56
por mm1 , mm0
movq [ r0 + r3 * 1 ], mm1
RET
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8 , 4 , 4
sub r0 , r3
@@ -1530,142 +1188,10 @@ INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
; int has_topright, ptrdiff_t stride)
; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
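; Down-right prediction: the diagonal runs from the top-left corner towards
; the bottom-right, so both the filtered left column and the filtered top row
; are needed; the punpckhbw/punpckhwd/punpckhdq sequence below gathers the
; left-column bytes into a single qword before filtering.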
INIT_MMX mmxext
cglobal pred8x8l_down_right_8 , 4 , 5
sub r0 , r3
lea r4 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 + r3 * 1 - 8 ]
punpckhbw mm1 , [ r0 + r3 * 2 - 8 ]
mov r4 , r0
punpckhwd mm1 , mm0
lea r0 , [ r0 + r3 * 4 ]
movq mm2 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm2 , [ r0 + r3 * 0 - 8 ]
lea r0 , [ r0 + r3 * 2 ]
movq mm3 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm3 , [ r0 + r3 * 0 - 8 ]
punpckhwd mm3 , mm2
punpckhdq mm3 , mm1
lea r0 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 ]
mov r0 , r4
movq mm4 , mm3
movq mm2 , mm3
PALIGNR mm4 , mm0 , 7 , mm0
PALIGNR mm1 , mm2 , 1 , mm2
test r1d , r1d ; top_left
jz .fix_lt_1
.do_left:
movq mm0 , mm4
PRED4x4_LOWPASS mm2 , mm1 , mm4 , mm3 , mm5
movq mm4 , mm0
movq mm7 , mm2
movq mm6 , mm2
PRED4x4_LOWPASS mm1 , mm3 , mm0 , mm4 , mm5
psllq mm1 , 56
PALIGNR mm7 , mm1 , 7 , mm3
movq mm0 , [ r0 - 8 ]
movq mm3 , [ r0 ]
movq mm1 , [ r0 + 8 ]
movq mm2 , mm3
movq mm4 , mm3
PALIGNR mm2 , mm0 , 7 , mm0
PALIGNR mm1 , mm4 , 1 , mm4
test r1d , r1d ; top_left
jz .fix_lt_2
test r2d , r2d ; top_right
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4 , mm2 , mm1 , mm3 , mm5
movq mm5 , mm4
jmp .body
.fix_lt_1:
movq mm5 , mm3
pxor mm5 , mm4
psrlq mm5 , 56
psllq mm5 , 48
pxor mm1 , mm5
jmp .do_left
.fix_lt_2:
movq mm5 , mm3
pxor mm5 , mm2
psllq mm5 , 56
psrlq mm5 , 56
pxor mm2 , mm5
test r2d , r2d ; top_right
jnz .do_top
.fix_tr_1:
movq mm5 , mm3
pxor mm5 , mm1
psrlq mm5 , 56
psllq mm5 , 56
pxor mm1 , mm5
jmp .do_top
.body:
lea r1 , [ r0 + r3 * 2 ]
movq mm1 , mm7
movq mm7 , mm5
movq mm5 , mm6
movq mm2 , mm7
lea r2 , [ r1 + r3 * 2 ]
PALIGNR mm2 , mm6 , 1 , mm0
movq mm3 , mm7
PALIGNR mm3 , mm6 , 7 , mm0
movq mm4 , mm7
lea r4 , [ r2 + r3 * 2 ]
psrlq mm4 , 8
PRED4x4_LOWPASS mm0 , mm1 , mm2 , mm5 , mm6
PRED4x4_LOWPASS mm1 , mm3 , mm4 , mm7 , mm6
movq [ r4 + r3 * 2 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r4 + r3 * 1 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r2 + r3 * 2 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r2 + r3 * 1 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r1 + r3 * 2 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r1 + r3 * 1 ], mm0
movq mm2 , mm1
psrlq mm0 , 8
psllq mm2 , 56
psrlq mm1 , 8
por mm0 , mm2
movq [ r0 + r3 * 2 ], mm0
psrlq mm0 , 8
psllq mm1 , 56
por mm0 , mm1
movq [ r0 + r3 * 1 ], mm0
RET
%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8 , 4 , 5
sub r0 , r3
@@ -1786,113 +1312,6 @@ PRED8x8L_DOWN_RIGHT
; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
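; Vertical-right prediction: the first row is the pairwise average (pavgb) of
; the filtered top samples, the second a 3-tap lowpass of them; each later
; pair of rows repeats the rows two above shifted by one pixel, with a
; filtered left-column sample inserted via PALIGNR.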
INIT_MMX mmxext
cglobal pred8x8l_vertical_right_8 , 4 , 5
sub r0 , r3
lea r4 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 + r3 * 1 - 8 ]
punpckhbw mm1 , [ r0 + r3 * 2 - 8 ]
mov r4 , r0
punpckhwd mm1 , mm0
lea r0 , [ r0 + r3 * 4 ]
movq mm2 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm2 , [ r0 + r3 * 0 - 8 ]
lea r0 , [ r0 + r3 * 2 ]
movq mm3 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm3 , [ r0 + r3 * 0 - 8 ]
punpckhwd mm3 , mm2
punpckhdq mm3 , mm1
lea r0 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 ]
mov r0 , r4
movq mm4 , mm3
movq mm2 , mm3
PALIGNR mm4 , mm0 , 7 , mm0
PALIGNR mm1 , mm2 , 1 , mm2
test r1d , r1d
jz .fix_lt_1
jmp .do_left
.fix_lt_1:
movq mm5 , mm3
pxor mm5 , mm4
psrlq mm5 , 56
psllq mm5 , 48
pxor mm1 , mm5
jmp .do_left
.fix_lt_2:
movq mm5 , mm3
pxor mm5 , mm2
psllq mm5 , 56
psrlq mm5 , 56
pxor mm2 , mm5
test r2d , r2d
jnz .do_top
.fix_tr_1:
movq mm5 , mm3
pxor mm5 , mm1
psrlq mm5 , 56
psllq mm5 , 56
pxor mm1 , mm5
jmp .do_top
.do_left:
movq mm0 , mm4
PRED4x4_LOWPASS mm2 , mm1 , mm4 , mm3 , mm5
movq mm7 , mm2
movq mm0 , [ r0 - 8 ]
movq mm3 , [ r0 ]
movq mm1 , [ r0 + 8 ]
movq mm2 , mm3
movq mm4 , mm3
PALIGNR mm2 , mm0 , 7 , mm0
PALIGNR mm1 , mm4 , 1 , mm4
test r1d , r1d
jz .fix_lt_2
test r2d , r2d
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm6 , mm2 , mm1 , mm3 , mm5
lea r1 , [ r0 + r3 * 2 ]
movq mm2 , mm6
movq mm3 , mm6
PALIGNR mm3 , mm7 , 7 , mm0
PALIGNR mm6 , mm7 , 6 , mm1
movq mm4 , mm3
pavgb mm3 , mm2
lea r2 , [ r1 + r3 * 2 ]
PRED4x4_LOWPASS mm0 , mm6 , mm2 , mm4 , mm5
movq [ r0 + r3 * 1 ], mm3
movq [ r0 + r3 * 2 ], mm0
movq mm5 , mm0
movq mm6 , mm3
movq mm1 , mm7
movq mm2 , mm1
psllq mm2 , 8
movq mm3 , mm1
psllq mm3 , 16
lea r4 , [ r2 + r3 * 2 ]
PRED4x4_LOWPASS mm0 , mm1 , mm3 , mm2 , mm4
PALIGNR mm6 , mm0 , 7 , mm2
movq [ r1 + r3 * 1 ], mm6
psllq mm0 , 8
PALIGNR mm5 , mm0 , 7 , mm1
movq [ r1 + r3 * 2 ], mm5
psllq mm0 , 8
PALIGNR mm6 , mm0 , 7 , mm2
movq [ r2 + r3 * 1 ], mm6
psllq mm0 , 8
PALIGNR mm5 , mm0 , 7 , mm1
movq [ r2 + r3 * 2 ], mm5
psllq mm0 , 8
PALIGNR mm6 , mm0 , 7 , mm2
movq [ r4 + r3 * 1 ], mm6
psllq mm0 , 8
PALIGNR mm5 , mm0 , 7 , mm1
movq [ r4 + r3 * 2 ], mm5
RET
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8 , 4 , 5 , 7
; manually spill XMM registers for Win64 because
@@ -2192,121 +1611,6 @@ PRED8x8L_HORIZONTAL_UP
; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
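; Horizontal-down prediction: averages and 3-tap lowpass values of the
; filtered left column are interleaved with punpcklbw/punpckhbw, and each
; higher output row is the row below it shifted by two bytes (PALIGNR).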
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8 , 4 , 5
sub r0 , r3
lea r4 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 + r3 * 1 - 8 ]
punpckhbw mm1 , [ r0 + r3 * 2 - 8 ]
mov r4 , r0
punpckhwd mm1 , mm0
lea r0 , [ r0 + r3 * 4 ]
movq mm2 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm2 , [ r0 + r3 * 0 - 8 ]
lea r0 , [ r0 + r3 * 2 ]
movq mm3 , [ r0 + r3 * 1 - 8 ]
punpckhbw mm3 , [ r0 + r3 * 0 - 8 ]
punpckhwd mm3 , mm2
punpckhdq mm3 , mm1
lea r0 , [ r0 + r3 * 2 ]
movq mm0 , [ r0 + r3 * 0 - 8 ]
movq mm1 , [ r4 ]
mov r0 , r4
movq mm4 , mm3
movq mm2 , mm3
PALIGNR mm4 , mm0 , 7 , mm0
PALIGNR mm1 , mm2 , 1 , mm2
test r1d , r1d
jnz .do_left
.fix_lt_1:
movq mm5 , mm3
pxor mm5 , mm4
psrlq mm5 , 56
psllq mm5 , 48
pxor mm1 , mm5
jmp .do_left
.fix_lt_2:
movq mm5 , mm3
pxor mm5 , mm2
psllq mm5 , 56
psrlq mm5 , 56
pxor mm2 , mm5
test r2d , r2d
jnz .do_top
.fix_tr_1:
movq mm5 , mm3
pxor mm5 , mm1
psrlq mm5 , 56
psllq mm5 , 56
pxor mm1 , mm5
jmp .do_top
.do_left:
movq mm0 , mm4
PRED4x4_LOWPASS mm2 , mm1 , mm4 , mm3 , mm5
movq mm4 , mm0
movq mm7 , mm2
movq mm6 , mm2
PRED4x4_LOWPASS mm1 , mm3 , mm0 , mm4 , mm5
psllq mm1 , 56
PALIGNR mm7 , mm1 , 7 , mm3
movq mm0 , [ r0 - 8 ]
movq mm3 , [ r0 ]
movq mm1 , [ r0 + 8 ]
movq mm2 , mm3
movq mm4 , mm3
PALIGNR mm2 , mm0 , 7 , mm0
PALIGNR mm1 , mm4 , 1 , mm4
test r1d , r1d
jz .fix_lt_2
test r2d , r2d
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4 , mm2 , mm1 , mm3 , mm5
movq mm5 , mm4
lea r1 , [ r0 + r3 * 2 ]
psllq mm7 , 56
movq mm2 , mm5
movq mm3 , mm6
movq mm4 , mm2
PALIGNR mm2 , mm6 , 7 , mm5
PALIGNR mm6 , mm7 , 7 , mm0
lea r2 , [ r1 + r3 * 2 ]
PALIGNR mm4 , mm3 , 1 , mm7
movq mm5 , mm3
pavgb mm3 , mm6
PRED4x4_LOWPASS mm0 , mm4 , mm6 , mm5 , mm7
movq mm4 , mm2
movq mm1 , mm2
lea r4 , [ r2 + r3 * 2 ]
psrlq mm4 , 16
psrlq mm1 , 8
PRED4x4_LOWPASS mm6 , mm4 , mm2 , mm1 , mm5
movq mm7 , mm3
punpcklbw mm3 , mm0
punpckhbw mm7 , mm0
movq mm1 , mm7
movq mm0 , mm7
movq mm4 , mm7
movq [ r4 + r3 * 2 ], mm3
PALIGNR mm7 , mm3 , 2 , mm5
movq [ r4 + r3 * 1 ], mm7
PALIGNR mm1 , mm3 , 4 , mm5
movq [ r2 + r3 * 2 ], mm1
PALIGNR mm0 , mm3 , 6 , mm3
movq [ r2 + r3 * 1 ], mm0
movq mm2 , mm6
movq mm3 , mm6
movq [ r1 + r3 * 2 ], mm4
PALIGNR mm6 , mm4 , 2 , mm5
movq [ r1 + r3 * 1 ], mm6
PALIGNR mm2 , mm4 , 4 , mm5
movq [ r0 + r3 * 2 ], mm2
PALIGNR mm3 , mm4 , 6 , mm4
movq [ r0 + r3 * 1 ], mm3
RET
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8 , 4 , 5
sub r0 , r3
@@ -2472,7 +1776,7 @@ cglobal pred4x4_dc_8, 3,5
; ptrdiff_t stride)
;-----------------------------------------------------------------------------
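; TrueMotion prediction on a 4x4 block; the mmxext version broadcasts the
; (left - topleft) difference with pshufw before adding it to the top row.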
%macro PRED4x4_TM 0
INIT_MMX mmxext
cglobal pred4x4_tm_vp8_8 , 3 , 6
sub r0 , r2
pxor mm7 , mm7
@@ -2487,15 +1791,8 @@ cglobal pred4x4_tm_vp8_8, 3,6
sub r3d , r4d
movd mm2 , r1d
movd mm4 , r3d
%if cpuflag(mmxext)
pshufw mm2 , mm2 , 0
pshufw mm4 , mm4 , 0
%else
punpcklwd mm2 , mm2
punpcklwd mm4 , mm4
punpckldq mm2 , mm2
punpckldq mm4 , mm4
%endif
paddw mm2 , mm0
paddw mm4 , mm0
packuswb mm2 , mm2
@@ -2506,12 +1803,6 @@ cglobal pred4x4_tm_vp8_8, 3,6
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8 , 3 , 3