@ -29,9 +29,13 @@ SECTION_RODATA
SECTION .text
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
% macro PRED4x4_LOWPASS 4
paddw % 2 , % 3
psrlw % 2 , 1
@ -335,3 +339,930 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
dec r2
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; MOV8 dst, lo [, hi]
; Store 16 bytes at address expression dst.
;   mmsize != 8 : one aligned 16-byte store of %2 (%3 is ignored).
;   mmsize == 8 : two 8-byte stores, %2 at dst+0 and %3 at dst+8.
; Letting callers always pass both registers keeps the call sites identical
; for the MMX and XMM builds.
%macro MOV8 2-3
%if mmsize != 8
    movdqa      [%1], %2
%else
    movq        [%1+0], %2
    movq        [%1+8], %3
%endif
%endmacro
; PRED8x8_DC suffix, shuffle_op
; Emits pred8x8_dc_10_<suffix>. %2 is the 4-word shuffle instruction used on
; the packed sums (pshufw / pshuflw in the instantiations below).
; Register use inside the function: r0 = src, r1 = stride, r2/r3 = scratch.
; Four partial sums are formed:
;   s0 = words 0..3 of the row above the block, s1 = words 4..7 of that row,
;   s2 = word at offset -2 of rows 0..3,        s3 = word at offset -2 of rows 4..7.
; The four output values are (s0+s2+4)>>3, (s1+2)>>2, (s3+2)>>2, (s1+s3+4)>>3;
; rows 0..3 are filled with the first two, rows 4..7 with the last two.
% macro PRED8x8_DC 2
cglobal pred8x8_dc_10_ % 1 , 2 , 4
; t0 keeps a copy of the row pointer while r0 walks down the left column.
; NOTE(review): r10 is used outside the 4 GPRs declared to cglobal, and r0m is
; presumably the stack slot of argument 0 -- confirm against x86inc.
% ifdef ARCH_X86_64
% define t0 r10
% else
% define t0 r0m
% endif
; r0 -> row above the block; m4 = 0 for the rounding pavgw further down
sub r0 , r1
pxor m4 , m4
movq m0 , [ r0 + 0 ]
movq m1 , [ r0 + 8 ]
; HADDW presumably reduces the words of its first operand to one sum, using
; the second operand as scratch (macro defined outside this chunk)
HADDW m0 , m2
mov t0 , r0
HADDW m1 , m2
; scalar sum of the left neighbours of rows 0..3
movzx r2d , word [ r0 + r1 * 1 - 2 ]
movzx r3d , word [ r0 + r1 * 2 - 2 ]
lea r0 , [ r0 + r1 * 2 ]
add r2d , r3d
movzx r3d , word [ r0 + r1 * 1 - 2 ]
add r2d , r3d
movzx r3d , word [ r0 + r1 * 2 - 2 ]
add r2d , r3d
lea r0 , [ r0 + r1 * 2 ]
movd m2 , r2d ; s2
; scalar sum of the left neighbours of rows 4..7
movzx r2d , word [ r0 + r1 * 1 - 2 ]
movzx r3d , word [ r0 + r1 * 2 - 2 ]
lea r0 , [ r0 + r1 * 2 ]
add r2d , r3d
movzx r3d , word [ r0 + r1 * 1 - 2 ]
add r2d , r3d
movzx r3d , word [ r0 + r1 * 2 - 2 ]
add r2d , r3d
movd m3 , r2d ; s3
; pack the four sums into the low four words of m0 and restore r0
punpcklwd m0 , m1
mov r0 , t0
punpcklwd m2 , m3
punpckldq m0 , m2 ; s0, s1, s2, s3
% 2 m3, m0, 11110110b ; s2, s1, s3, s3
; r2 = 3*stride, r3 = r0 + 4*stride: addressing helpers for the stores
lea r2 , [ r1 + r1 * 2 ]
% 2 m0, m0, 01110100b ; s0, s1, s3, s1
; lanes now hold s0+s2, 2*s1, 2*s3, s1+s3; >>2 followed by pavgw with zero
; ((x+1)>>1) yields the rounded averages listed in the macro header
paddw m0 , m3
lea r3 , [ r0 + r1 * 4 ]
psrlw m0 , 2
pavgw m0 , m4 ; s0+s2, s1, s3, s1+s3
; broadcast: the SSE2 path builds one 16-byte row per block half (m1 for rows
; 0..3 via SWAP, m3 for rows 4..7); the MMX path splats each value into its
; own 8-byte register (m1,m2 for the upper half, m3,m4 for the lower half)
% ifidn %1, sse2
punpcklwd m0 , m0
pshufd m3 , m0 , 11111010b
punpckldq m0 , m0
SWAP 0 , 1
% else
pshufw m1 , m0 , 0x00
pshufw m2 , m0 , 0x55
pshufw m3 , m0 , 0xaa
pshufw m4 , m0 , 0xff
% endif
; rows 0..3 (relative to the restored r0, which points one row above the block)
MOV8 r0 + r1 * 1 , m1 , m2
MOV8 r0 + r1 * 2 , m1 , m2
MOV8 r0 + r2 * 1 , m1 , m2
MOV8 r0 + r1 * 4 , m1 , m2
; rows 4..7
MOV8 r3 + r1 * 1 , m3 , m4
MOV8 r3 + r1 * 2 , m3 , m4
MOV8 r3 + r2 * 1 , m3 , m4
MOV8 r3 + r1 * 4 , m3 , m4
RET
% endmacro
; one instantiation per register width
INIT_MMX
PRED8x8_DC mmxext , pshufw
INIT_XMM
PRED8x8_DC ss e2 , pshuflw
;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; PRED8x8_TOP_DC suffix, shuffle_op
; Emits pred8x8_top_dc_10_<suffix>: r0 = src, r1 = stride.
; Each half of the row above the block (4 words) is reduced to
; (sum + 2) >> 2 and that value fills the matching 4-word column half of all
; eight rows. %2 is the word-shuffle instruction used for the broadcast.
%macro PRED8x8_TOP_DC 2
cglobal pred8x8_top_dc_10_%1, 2,4
    ; pointers: r0 -> row above the block, r2 = 3*stride, r3 = r0 + 4*stride
    sub         r0, r1
    lea         r2, [r1+r1*2]
    lea         r3, [r0+r1*4]

    ; left half: words 0..3 of the top row (m2 is HADDW scratch)
    movq        m0, [r0+0]
    HADDW       m0, m2
    paddw       m0, [pw_2]
    psrlw       m0, 2
    %2          m0, m0, 0

    ; right half: words 4..7 of the top row (m3 is HADDW scratch)
    movq        m1, [r0+8]
    HADDW       m1, m3
    paddw       m1, [pw_2]
    psrlw       m1, 2
    %2          m1, m1, 0

%ifidn %1, sse2
    ; merge both halves into a single 16-byte row
    punpcklqdq  m0, m1
%endif

    ; rows 0..3
    MOV8        r0+r1*1, m0, m1
    MOV8        r0+r1*2, m0, m1
    MOV8        r0+r2*1, m0, m1
    MOV8        r0+r1*4, m0, m1
    ; rows 4..7
    MOV8        r3+r1*1, m0, m1
    MOV8        r3+r1*2, m0, m1
    MOV8        r3+r2*1, m0, m1
    MOV8        r3+r1*4, m0, m1
    RET
%endmacro

INIT_MMX
PRED8x8_TOP_DC mmxext, pshufw
INIT_XMM
PRED8x8_TOP_DC sse2, pshuflw
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_TOP_DC suffix
; Emits pred8x8l_top_dc_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; The top row is low-pass filtered ((t[n-1] + 2*t[n] + t[n+1] + 2) >> 2), the
; eight filtered words are summed, and (sum + 4) >> 3 fills the whole block.
; Uses m0-m5 only, matching the 6 XMM registers declared to cglobal.
; NOTE(review): stride is used as a full-width register although the prototype
; declares it as int -- confirm callers pass it sign-extended.
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_10_%1, 4,4,6
    sub         r0, r3                  ; r0 -> row above the block
    mova        m0, [r0-16]             ; 8 words left of the top row
    mova        m3, [r0]                ; top row t0..t7
    mova        m1, [r0+16]             ; 8 words right of the top row
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0          ; m2 = left neighbours  (lt t0..t6)
    PALIGNR     m1, m4,  2, m4          ; m1 = right neighbours (t1..t7 tr0)
    ; The flags are 32-bit ints: test only the low dword, the upper half of a
    ; 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d                ; top_left
    jz .fix_lt_2
    test        r2d, r2d                ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d                ; top_right
    jnz .body
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
.body:
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    PRED4x4_LOWPASS m0, m2, m1, m3      ; m0 = filtered top row
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
;-----------------------------------------------------------------------------
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;TODO: see if scalar is faster
; PRED8x8L_DC suffix
; Emits pred8x8l_dc_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch.
; Both the left column and the top row are low-pass filtered
; ((t[n-1] + 2*t[n] + t[n+1] + 2) >> 2); the 16 filtered words are summed and
; (sum + 8) >> 4 fills the whole block.
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_10_%1, 4,5,8
    sub         r0, r3                  ; r0 -> row above the block
    ; Gather word 7 of the 16 bytes left of each row (the left neighbours).
    ; After the unpacks m3 = l6 l5 l4 l3 l2 l1 l0 lt (low word first).
    lea         r4, [r0+r3*2]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r4+r3*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    mov         r4, r0                  ; keep the top-row pointer
    punpckhdq   m1, m0
    lea         r0, [r0+r3*4]
    mova        m2, [r0+r3*1-16]
    punpckhwd   m2, [r0+r3*0-16]
    lea         r0, [r0+r3*2]
    mova        m3, [r0+r3*1-16]
    punpckhwd   m3, [r0+r3*0-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    lea         r0, [r0+r3*2]
    mova        m0, [r0+r3*0-16]        ; word 7 = l7
    mova        m1, [r4]                ; top row
    mov         r0, r4
    mova        m4, m3
    mova        m2, m3
    PALIGNR     m4, m0, 14, m0          ; m4 = l7 l6 .. l0
    PALIGNR     m1, m2,  2, m2          ; m1 = l5 .. l0 lt t0
    ; has_topleft is a 32-bit int: test only the low dword, the upper half of
    ; a 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d
    jnz .do_left
.fix_lt_1:
    ; no top-left: replace lt (word 6 of m1) with l0
    mova        m5, m3
    pxor        m5, m4
    psrldq      m5, 14
    pslldq      m5, 12
    pxor        m1, m5
    jmp .do_left
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3 (first top pixel)
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d
    jnz .body
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3 (last top pixel)
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
    jmp .body
.do_left:
    ; m7 = filtered left column
    mova        m0, m4
    PRED4x4_LOWPASS m2, m1, m4, m3
    mova        m4, m0
    mova        m7, m2
    PRED4x4_LOWPASS m1, m3, m0, m4
    pslldq      m1, 14
    PALIGNR     m7, m1, 14, m3
    ; load the top row and its left/right shifted neighbours
    mova        m0, [r0-16]
    mova        m3, [r0]
    mova        m1, [r0+16]
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0
    PALIGNR     m1, m4,  2, m4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.body:
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    PRED4x4_LOWPASS m6, m2, m1, m3      ; m6 = filtered top row
    HADDW       m7, m0
    HADDW       m6, m0
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    paddw       m7, [pw_8]
    paddw       m7, m6
    psrlw       m7, 4
    SPLATW      m7, m7
    mova [r0+r3*1], m7
    mova [r0+r3*2], m7
    mova [r0+r1*1], m7
    mova [r0+r3*4], m7
    mova [r2+r3*1], m7
    mova [r2+r3*2], m7
    mova [r2+r1*1], m7
    mova [r2+r3*4], m7
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL suffix
; Emits pred8x8l_vertical_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; The top row is low-pass filtered ((t[n-1] + 2*t[n] + t[n+1] + 2) >> 2) and
; the filtered row is copied into all eight rows of the block.
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_10_%1, 4,4,6
    sub         r0, r3                  ; r0 -> row above the block
    mova        m0, [r0-16]             ; 8 words left of the top row
    mova        m3, [r0]                ; top row t0..t7
    mova        m1, [r0+16]             ; 8 words right of the top row
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0          ; m2 = left neighbours  (lt t0..t6)
    PALIGNR     m1, m4,  2, m4          ; m1 = right neighbours (t1..t7 tr0)
    ; The flags are 32-bit ints: test only the low dword, the upper half of a
    ; 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d                ; top_left
    jz .fix_lt_2
    test        r2d, r2d                ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d                ; top_right
    jnz .body
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
.body:
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    PRED4x4_LOWPASS m0, m2, m1, m3      ; m0 = filtered top row
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL suffix
; Emits pred8x8l_horizontal_10_<suffix>:
;   r0 = src, r1 = has_topleft, r3 = stride; r2 (has_topright) is not read and
;   is reused as scratch.
; The left column is low-pass filtered ((t[n-1] + 2*t[n] + t[n+1] + 2) >> 2)
; and each filtered value is broadcast across its own row.
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_10_%1, 4,4,8
    sub         r0, r3                  ; r0 -> row above the block
    lea         r2, [r0+r3*2]
    mova        m0, [r0+r3*1-16]
    ; has_topleft is a 32-bit int: test only the low dword, the upper half of
    ; a 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d
    ; r1 = row to take the top-left value from: the row above the block when
    ; has_topleft is set, otherwise row 0 (lea leaves the flags untouched)
    lea         r1, [r0+r3]
    cmovnz      r1, r0
    ; Gather word 7 of the 16 bytes left of each row.
    ; After the unpacks m3 = l6 l5 l4 l3 l2 l1 l0 lt (low word first).
    punpckhwd   m0, [r1+r3*0-16]
    mova        m1, [r2+r3*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    mov         r2, r0                  ; keep the top-row pointer
    punpckhdq   m1, m0
    lea         r0, [r0+r3*4]
    mova        m2, [r0+r3*1-16]
    punpckhwd   m2, [r0+r3*0-16]
    lea         r0, [r0+r3*2]
    mova        m3, [r0+r3*1-16]
    punpckhwd   m3, [r0+r3*0-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    lea         r0, [r0+r3*2]
    mova        m0, [r0+r3*0-16]        ; word 7 = l7
    mova        m1, [r1+r3*0-16]
    mov         r0, r2
    mova        m4, m3
    mova        m2, m3
    PALIGNR     m4, m0, 14, m0          ; m4 = l7 l6 .. l0
    PALIGNR     m1, m2,  2, m2
    ; m7 = filtered left column, word 0 = row 7 ... word 7 = row 0
    mova        m0, m4
    PRED4x4_LOWPASS m2, m1, m4, m3
    mova        m4, m0
    mova        m7, m2
    PRED4x4_LOWPASS m1, m3, m0, m4
    pslldq      m1, 14
    PALIGNR     m7, m1, 14, m3
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    ; duplicate every word into a dword, then splat one dword per row
    punpckhwd   m3, m7, m7              ; rows 3..0
    punpcklwd   m7, m7                  ; rows 7..4
    pshufd      m0, m3, 0xff
    pshufd      m1, m3, 0xaa
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    pshufd      m4, m7, 0xff
    pshufd      m5, m7, 0xaa
    pshufd      m6, m7, 0x55
    pshufd      m7, m7, 0x00
    mova [r0+r3*1], m0
    mova [r0+r3*2], m1
    mova [r0+r1*1], m2
    mova [r0+r3*4], m3
    mova [r2+r3*1], m4
    mova [r2+r3*2], m5
    mova [r2+r1*1], m6
    mova [r2+r3*4], m7
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
;-----------------------------------------------------------------------------
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_DOWN_LEFT suffix
; Emits pred8x8l_down_left_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; The top row and the top-right row are low-pass filtered, filtered once more
; across the joined 16-word sequence, and each output row is an 8-word window
; into that sequence, one word further along per row.
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_10_%1, 4,4,8
    sub         r0, r3                  ; r0 -> row above the block
    mova        m0, [r0-16]             ; 8 words left of the top row
    mova        m3, [r0]                ; top row t0..t7
    mova        m1, [r0+16]             ; top-right row
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0          ; m2 = left neighbours  (lt t0..t6)
    PALIGNR     m1, m4,  2, m4          ; m1 = right neighbours (t1..t7 tr0)
    ; The flags are 32-bit ints: test only the low dword, the upper half of a
    ; 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
    jmp .do_top
.fix_tr_2:
    ; no top-right: use word 7 of the top row for every top-right position
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS m4, m2, m1, m3
    mova        m7, m4                  ; m7 = filtered top row
    test        r2d, r2d
    jz .fix_tr_2
    ; filter the top-right row; its last word is reused as its own right
    ; neighbour
    mova        m0, [r0+16]
    mova        m5, m0
    mova        m2, m0
    mova        m4, m0
    psrldq      m5, 14
    PALIGNR     m2, m3, 14, m3
    PALIGNR     m5, m4,  2, m4
    PRED4x4_LOWPASS m1, m2, m5, m0      ; m1 = filtered top-right row
.do_topright:
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    mova        m6, m1
    psrldq      m1, 14
    mova        m4, m1
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    mova        m2, m6
    PALIGNR     m2, m7,  2, m0
    mova        m3, m6
    PALIGNR     m3, m7, 14, m0
    PALIGNR     m4, m6,  2, m0
    mova        m5, m7
    mova        m1, m7
    mova        m7, m6
    pslldq      m1, 2
    PRED4x4_LOWPASS m0, m1, m2, m5
    PRED4x4_LOWPASS m1, m3, m4, m7
    ; With S = m0 (low 8 words) followed by m1 (high 8 words), row y receives
    ; S[y+1 .. y+8]. Rows are written bottom-up; every step shifts m1 up by
    ; one word and pulls the top word of m0 into its word 0.
    mova [r2+r3*4], m1                  ; row 7
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r2+r1*1], m1                  ; row 6
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r2+r3*2], m1                  ; row 5
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r2+r3*1], m1                  ; row 4
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r0+r3*4], m1                  ; row 3
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r0+r1*1], m1                  ; row 2
    mova        m2, m0
    pslldq      m1, 2
    psrldq      m2, 14
    pslldq      m0, 2
    por         m1, m2
    mova [r0+r3*2], m1                  ; row 1
    pslldq      m1, 2
    psrldq      m0, 14
    por         m1, m0
    mova [r0+r3*1], m1                  ; row 0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_DOWN_RIGHT suffix
; Emits pred8x8l_down_right_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch.
; The left column, the top-left value and the top row are low-pass filtered,
; filtered once more across the joined sequence, and each output row is an
; 8-word window into that sequence, one word earlier per row going up.
%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_10_%1, 4,5,8
    sub         r0, r3                  ; r0 -> row above the block
    ; Gather word 7 of the 16 bytes left of each row (the left neighbours).
    ; After the unpacks m3 = l6 l5 l4 l3 l2 l1 l0 lt (low word first).
    lea         r4, [r0+r3*2]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r4+r3*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    mov         r4, r0                  ; keep the top-row pointer
    punpckhdq   m1, m0
    lea         r0, [r0+r3*4]
    mova        m2, [r0+r3*1-16]
    punpckhwd   m2, [r0+r3*0-16]
    lea         r0, [r0+r3*2]
    mova        m3, [r0+r3*1-16]
    punpckhwd   m3, [r0+r3*0-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    lea         r0, [r0+r3*2]
    mova        m0, [r0+r3*0-16]        ; word 7 = l7
    mova        m1, [r4]                ; top row
    mov         r0, r4
    mova        m4, m3
    mova        m2, m3
    PALIGNR     m4, m0, 14, m0          ; m4 = l7 l6 .. l0
    PALIGNR     m1, m2,  2, m2          ; m1 = l5 .. l0 lt t0
    ; The flags are 32-bit ints: test only the low dword, the upper half of a
    ; 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d                ; top_left
    jz .fix_lt_1
.do_left:
    ; m6 = filtered (l6 .. l0 lt), m7 = filtered left column l7 .. l0
    mova        m0, m4
    PRED4x4_LOWPASS m2, m1, m4, m3
    mova        m4, m0
    mova        m7, m2
    mova        m6, m2
    PRED4x4_LOWPASS m1, m3, m0, m4
    pslldq      m1, 14
    PALIGNR     m7, m1, 14, m3
    ; load the top row and its left/right shifted neighbours
    mova        m0, [r0-16]
    mova        m3, [r0]
    mova        m1, [r0+16]
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0
    PALIGNR     m1, m4,  2, m4
    test        r1d, r1d                ; top_left
    jz .fix_lt_2
    test        r2d, r2d                ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS m4, m2, m1, m3
    mova        m5, m4                  ; m5 = filtered top row
    jmp .body
.fix_lt_1:
    ; no top-left: replace lt (word 6 of m1) with l0
    mova        m5, m3
    pxor        m5, m4
    psrldq      m5, 14
    pslldq      m5, 12
    pxor        m1, m5
    jmp .do_left
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3 (first top pixel)
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d                ; top_right
    jnz .do_top
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3 (last top pixel)
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
    jmp .do_top
.body:
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    mova        m1, m7
    mova        m7, m5
    mova        m5, m6
    mova        m2, m7
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    PALIGNR     m2, m6,  2, m0
    mova        m3, m7
    PALIGNR     m3, m6, 14, m0
    mova        m4, m7
    psrldq      m4, 2
    PRED4x4_LOWPASS m0, m1, m2, m5
    PRED4x4_LOWPASS m1, m3, m4, m7
    ; With S = m0 (low 8 words) followed by m1 (high 8 words), row y receives
    ; S[7-y .. 14-y]. Rows are written bottom-up; every step shifts m0 down
    ; by one word and pulls word 0 of m1 into its word 7.
    mova [r2+r3*4], m0                  ; row 7
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r2+r1*1], m0                  ; row 6
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r2+r3*2], m0                  ; row 5
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r2+r3*1], m0                  ; row 4
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r0+r3*4], m0                  ; row 3
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r0+r1*1], m0                  ; row 2
    mova        m2, m1
    psrldq      m0, 2
    pslldq      m2, 14
    psrldq      m1, 2
    por         m0, m2
    mova [r0+r3*2], m0                  ; row 1
    psrldq      m0, 2
    pslldq      m1, 14
    por         m0, m1
    mova [r0+r3*1], m0                  ; row 0
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL_RIGHT suffix
; Emits pred8x8l_vertical_right_10_<suffix>:
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch.
; Rows 0 and 1 are a 2-tap average (pavgw) and a 3-tap low-pass of the
; filtered top / top-left / left values; every following pair of rows is the
; pair above shifted right by one word, with a further filtered left-column
; value entering at word 0.
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_10_%1, 4,5,8
    sub         r0, r3                  ; r0 -> row above the block
    ; Gather word 7 of the 16 bytes left of each row (the left neighbours).
    ; After the unpacks m3 = l6 l5 l4 l3 l2 l1 l0 lt (low word first).
    lea         r4, [r0+r3*2]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r4+r3*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    mov         r4, r0                  ; keep the top-row pointer
    punpckhdq   m1, m0
    lea         r0, [r0+r3*4]
    mova        m2, [r0+r3*1-16]
    punpckhwd   m2, [r0+r3*0-16]
    lea         r0, [r0+r3*2]
    mova        m3, [r0+r3*1-16]
    punpckhwd   m3, [r0+r3*0-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    lea         r0, [r0+r3*2]
    mova        m0, [r0+r3*0-16]        ; word 7 = l7
    mova        m1, [r4]                ; top row
    mov         r0, r4
    mova        m4, m3
    mova        m2, m3
    PALIGNR     m4, m0, 14, m0          ; m4 = l7 l6 .. l0
    PALIGNR     m1, m2,  2, m2          ; m1 = l5 .. l0 lt t0
    ; The flags are 32-bit ints: test only the low dword, the upper half of a
    ; 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d
    jnz .do_left
.fix_lt_1:
    ; no top-left: replace lt (word 6 of m1) with l0
    mova        m5, m3
    pxor        m5, m4
    psrldq      m5, 14
    pslldq      m5, 12
    pxor        m1, m5
    jmp .do_left
.fix_lt_2:
    ; no top-left: word 0 of m2 := word 0 of m3 (first top pixel)
    mova        m5, m3
    pxor        m5, m2
    pslldq      m5, 14
    psrldq      m5, 14
    pxor        m2, m5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; no top-right: word 7 of m1 := word 7 of m3 (last top pixel)
    mova        m5, m3
    pxor        m5, m1
    psrldq      m5, 14
    pslldq      m5, 14
    pxor        m1, m5
    jmp .do_top
.do_left:
    PRED4x4_LOWPASS m2, m1, m4, m3
    mova        m7, m2                  ; m7 = filtered (l6 .. l0 lt)
    ; load the top row and its left/right shifted neighbours
    mova        m0, [r0-16]
    mova        m3, [r0]
    mova        m1, [r0+16]
    mova        m2, m3
    mova        m4, m3
    PALIGNR     m2, m0, 14, m0
    PALIGNR     m1, m4,  2, m4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS m6, m2, m1, m3      ; m6 = filtered top row
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    mova        m2, m6
    mova        m3, m6
    PALIGNR     m3, m7, 14, m0
    PALIGNR     m6, m7, 12, m1
    mova        m4, m3
    pavgw       m3, m2
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    PRED4x4_LOWPASS m0, m6, m2, m4
    mova [r0+r3*1], m3                  ; row 0
    mova [r0+r3*2], m0                  ; row 1
    mova        m5, m0                  ; m5 tracks the odd rows
    mova        m6, m3                  ; m6 tracks the even rows
    ; m0 = 3-tap filter over the left-side values; its top word is consumed
    ; by each PALIGNR below, then m0 is shifted up for the next row
    mova        m1, m7
    mova        m2, m1
    pslldq      m2, 2
    mova        m3, m1
    pslldq      m3, 4
    PRED4x4_LOWPASS m0, m1, m3, m2
    PALIGNR     m6, m0, 14, m2
    mova [r0+r1*1], m6                  ; row 2
    pslldq      m0, 2
    PALIGNR     m5, m0, 14, m1
    mova [r0+r3*4], m5                  ; row 3
    pslldq      m0, 2
    PALIGNR     m6, m0, 14, m2
    mova [r2+r3*1], m6                  ; row 4
    pslldq      m0, 2
    PALIGNR     m5, m0, 14, m1
    mova [r2+r3*2], m5                  ; row 5
    pslldq      m0, 2
    PALIGNR     m6, m0, 14, m2
    mova [r2+r1*1], m6                  ; row 6
    pslldq      m0, 2
    PALIGNR     m5, m0, 14, m1
    mova [r2+r3*4], m5                  ; row 7
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL_UP suffix
; Emits pred8x8l_horizontal_up_10_<suffix>:
;   r0 = src, r1 = has_topleft, r3 = stride; r2 (has_topright) is not read and
;   is reused as scratch.
; The filtered left column is reversed so that l0 sits in word 0, 2-tap
; averages and 3-tap low-pass values are interleaved, and each row is a
; window into that interleaved sequence advanced by one dword per row.
; The lane comments below list words from the highest lane to the lowest.
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_10_%1, 4,4,8
    sub         r0, r3                  ; r0 -> row above the block
    lea         r2, [r0+r3*2]
    mova        m0, [r0+r3*1-16]
    ; has_topleft is a 32-bit int: test only the low dword, the upper half of
    ; a 64-bit argument register is not guaranteed to be zero.
    test        r1d, r1d
    ; r1 = row to take the top-left value from: the row above the block when
    ; has_topleft is set, otherwise row 0 (lea leaves the flags untouched)
    lea         r1, [r0+r3]
    cmovnz      r1, r0
    ; gather word 7 of the 16 bytes left of each row
    punpckhwd   m0, [r1+r3*0-16]
    mova        m1, [r2+r3*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    mov         r2, r0                  ; keep the top-row pointer
    punpckhdq   m1, m0
    lea         r0, [r0+r3*4]
    mova        m2, [r0+r3*1-16]
    punpckhwd   m2, [r0+r3*0-16]
    lea         r0, [r0+r3*2]
    mova        m3, [r0+r3*1-16]
    punpckhwd   m3, [r0+r3*0-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    lea         r0, [r0+r3*2]
    mova        m0, [r0+r3*0-16]
    mova        m1, [r1+r3*0-16]
    mov         r0, r2
    mova        m4, m3
    mova        m2, m3
    PALIGNR     m4, m0, 14, m0
    PALIGNR     m1, m2,  2, m2
    ; m7 = filtered left column
    mova        m0, m4
    PRED4x4_LOWPASS m2, m1, m4, m3
    mova        m4, m0
    mova        m7, m2
    PRED4x4_LOWPASS m1, m3, m0, m4
    pslldq      m1, 14
    PALIGNR     m7, m1, 14, m3
    lea         r1, [r3+r3*2]           ; r1 = 3*stride
    ; reverse the word order: dword reversal, then swap the words of each dword
    pshufd      m0, m7, 00011011b       ; l6 l7 l4 l5 l2 l3 l0 l1
    pslldq      m7, 14                  ; l7 .. .. .. .. .. .. ..
    mova        m2, m0
    pslld       m0, 16
    psrld       m2, 16
    por         m2, m0                  ; l7 l6 l5 l4 l3 l2 l1 l0
    mova        m3, m2
    mova        m4, m2
    mova        m5, m2
    psrldq      m2, 2
    psrldq      m3, 4
    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
    por         m2, m7                  ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhwd   m7, m7
    por         m3, m7                  ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgw       m4, m2                  ; 2-tap averages
    PRED4x4_LOWPASS m1, m3, m5, m2      ; 3-tap low-pass values
    mova        m5, m4
    punpcklwd   m4, m1                  ; p4 p3 p2 p1
    punpckhwd   m5, m1                  ; p8 p7 p6 p5
    mova        m6, m5
    mova        m7, m5
    mova        m0, m5
    ; rows 1..3 slide across m5:m4 by 4/8/12 bytes; rows 5..7 repeat the last
    ; dword of m5 to fill the tail
    PALIGNR     m5, m4, 4, m1
    pshufd      m1, m6, 11111001b
    PALIGNR     m6, m4, 8, m2
    pshufd      m2, m7, 11111110b
    PALIGNR     m7, m4, 12, m3
    pshufd      m3, m0, 11111111b
    mova [r0+r3*1], m4
    mova [r0+r3*2], m5
    mova [r0+r1*1], m6
    mova [r0+r3*4], m7
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    mova [r2+r3*4], m3
    RET
%endmacro

INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP sse2
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; MOV16 dst, a, b [, c, d]
; Store 32 bytes at address expression dst using mova.
;   %2 and %3 go to dst+0 and dst+mmsize.
;   When mmsize == 8, %4 and %5 additionally go to dst+16 and dst+24;
;   otherwise they are ignored.
%macro MOV16 3-5
    mova        [%1+mmsize*0], %2
    mova        [%1+mmsize*1], %3
%if mmsize == 8
    mova        [%1+mmsize*2], %4
    mova        [%1+mmsize*3], %5
%endif
%endmacro
; PRED16x16_VERTICAL suffix
; Emits pred16x16_vertical_10_<suffix>: r0 = src, r1 = stride, r2 = counter.
; The 32 bytes of the row above the block are loaded once and copied into the
; following 16 rows, two rows per loop iteration.
%macro PRED16x16_VERTICAL 1
cglobal pred16x16_vertical_10_%1, 2,3
    sub         r0, r1                  ; r0 -> row above the block
    mova        m0, [r0+mmsize*0]
    mova        m1, [r0+mmsize*1]
%if mmsize == 8
    ; 8-byte registers need four loads to cover the 32-byte row
    mova        m2, [r0+mmsize*2]
    mova        m3, [r0+mmsize*3]
%endif
    mov         r2, 8                   ; 8 iterations x 2 rows
.row_pair:
    MOV16       r0+r1*1, m0, m1, m2, m3
    MOV16       r0+r1*2, m0, m1, m2, m3
    lea         r0, [r0+r1*2]
    dec         r2
    jg .row_pair
    REP_RET
%endmacro

INIT_MMX
PRED16x16_VERTICAL mmxext
INIT_XMM
PRED16x16_VERTICAL sse2
;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; PRED16x16_HORIZONTAL suffix
; Emits pred16x16_horizontal_10_<suffix>: r0 = src, r1 = stride, r2 = counter.
; For each of the 16 rows, the word immediately left of the row (offset -2) is
; broadcast across the row's 32 bytes. Two rows are handled per iteration.
%macro PRED16x16_HORIZONTAL 1
cglobal pred16x16_horizontal_10_%1, 2,3
    mov         r2, 8                   ; 8 iterations x 2 rows = 16 rows
.vloop:
    ; movd fetches the words at offsets -4 and -2; SPLATW index 1 presumably
    ; selects the second of them, i.e. the pixel directly left of the row
    movd        m0, [r0+r1*0-4]
    movd        m1, [r0+r1*1-4]
    SPLATW      m0, m0, 1
    SPLATW      m1, m1, 1
    MOV16       r0+r1*0, m0, m0, m0, m0
    MOV16       r0+r1*1, m1, m1, m1, m1
    lea         r0, [r0+r1*2]
    dec         r2
    ; jg, not jge: the counter starts at 8, so looping while it is still >= 0
    ; after the decrement would run a 9th iteration and write two rows past
    ; the end of the 16-row block
    jg .vloop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_HORIZONTAL mmxext
INIT_XMM
PRED16x16_HORIZONTAL sse2