@ -29,8 +29,6 @@ pb_zzzzzzzz77777777: times 8 db -1
; Byte-wise constant tables. In the pb_* names a 'z' appears to stand for a
; -1 (0xFF) byte and a hex digit for the byte value stored at that position.
; NOTE(review): the name of the label preceding pb_7 (pb_zzzzzzzz77777777)
; suggests it relies on pb_7 following it directly in memory -- confirm before
; moving either of them.
pb_7:                db  7,  7,  7,  7,  7,  7,  7,  7
pb_zzzz3333zzzzbbbb: times 4 db -1
                     times 4 db  3
                     times 4 db -1
                     times 4 db 11
pb_zz11zz55zz99zzdd: db -1, -1,  1,  1, -1, -1,  5,  5
                     db -1, -1,  9,  9, -1, -1, 13, 13
; Mask loaded into m5 by APPLY_WINDOW_INT16 and passed to REVERSE_WORDS as the
; pshufb operand (SHUFFLE_MASK_W itself is defined outside this chunk).
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
; Rounding term added before the arithmetic right shift by 15 in the
; APPLY_WINDOW_INT16 path that widens to 32 bits.
pd_16384:            times 4 dd 1 << 14
; Byte indices that reverse the order of each group of four bytes.
pb_bswap32:          db  3,  2,  1,  0,  7,  6,  5,  4
                     db 11, 10,  9,  8, 15, 14, 13, 12
; Switch from the constant tables to the code section. This must be the
; single-token SECTION_TEXT macro: "SECTION _TEXT" would instead open a
; literal section named "_TEXT".
SECTION_TEXT
@ -210,134 +208,6 @@ SCALARPRODUCT_LOOP 0
RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; REVERSE_WORDS reg [, mask]
;   Reverses the order of the 16-bit words in %1, in place.
;     %1  vector register to permute
;     %2  pshufb mask operand; only read by the ssse3 (non-atom) variant, where
;         the resulting permutation is whatever the mask encodes (the caller in
;         this file passes a register loaded from pb_revwords). Ignored by the
;         other variants.
;   Assembly fails if none of mmxext / sse2 / ssse3 is selected, rather than
;   silently expanding to nothing.
;-----------------------------------------------------------------------------
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb   %1, %2
%elif cpuflag(sse2)
    ; 0x1B = 00_01_10_11b reverses four words; apply it to each 64-bit half...
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    ; ...then 0x4E = 01_00_11_10b swaps the two 64-bit halves.
    pshufd   %1, %1, 0x4E
%elif cpuflag(mmxext)
    ; A 64-bit register holds only four words, so one shuffle is enough.
    pshufw   %1, %1, 0x1B
%else
    %error "REVERSE_WORDS requires mmxext, sse2 or ssse3"
%endif
%endmacro
;-----------------------------------------------------------------------------
; MUL16FIXED dst, src, tmp
;   Per-word signed 16x16 multiply keeping product bits 30..15 (a fixed-point
;   multiply with 15 fractional bits). The result replaces %1.
;     %1  destination / first factor
;     %2  second factor
;     %3  scratch register: unused by the ssse3 variant, clobbered by the
;         mmxext variant. Must differ from %1 and %2.
;   The two variants are not bit-identical: ssse3 rounds, mmxext truncates.
;   Assembly fails if neither mmxext nor ssse3 is selected, rather than
;   silently expanding to nothing.
;-----------------------------------------------------------------------------
%macro MUL16FIXED 3
%if cpuflag(ssse3)
    ; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext)
    ; dst = (dst * src) >> 15
    ; pmulhw returns product bits 31..16 and pmullw bits 15..0, so the wanted
    ; bits 30..15 are (high << 1) | (low >> 15).
    mova     %3, %1
    pmulhw   %1, %2
    pmullw   %3, %2
    psrlw    %3, 15
    psllw    %1, 1
    por      %1, %3
%else
    %error "MUL16FIXED requires mmxext or ssse3"
%endif
%endmacro
;-----------------------------------------------------------------------------
; APPLY_WINDOW_INT16 bitexact
;   Emits one window-application function for the instruction set currently
;   selected with INIT_MMX / INIT_XMM.
;     %1 = 1 -> apply_window_int16        (bit-exact variant)
;     %1 = 0 -> apply_window_int16_round  (the mmxext/sse2 code emitted for
;                                          this case truncates, see below)
;
;   Generated function arguments: output, input, window, offset, where offset
;   is the `len` argument and is used directly as a byte offset into all three
;   buffers. Each iteration processes one vector at offset2 (walking down from
;   offset - mmsize) and one at offset (walking up). The window is only read
;   at offset2 and is word-reversed before being applied at offset.
;   NOTE(review): presumably len counts int16 samples, which makes byte offset
;   len the midpoint of the buffers -- confirm against the C prototype's users.
;   NOTE(review): there is no guard for len == 0; the loop body always runs at
;   least once, starting at offset2 = len - mmsize.
;-----------------------------------------------------------------------------
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4, 5, 6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4, 5, 6, output, input, window, offset, offset2
%endif
%if ARCH_X86_64
    ; len is a 32-bit unsigned int: the upper half of its register is not
    ; guaranteed to be zero on entry, yet offsetq is used for addressing below.
    mov      offsetd, offsetd
%endif
    lea      offset2q, [offsetq - mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova     m5, [pb_revwords]          ; pshufb mask for REVERSE_WORDS
    ALIGN 16
%elif %1
    ; Rounding constant for the 32-bit path. Also loaded (but not read) by the
    ; ssse3+atom variant, whose loop uses pmulhrsw and the sse2-style shuffle.
    mova     m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova     m0, [windowq + offset2q]
    mova     m1, [inputq + offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [inputq + offsetq]
    mova     [outputq + offset2q], m1
    mova     [outputq + offsetq], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    ;
    ; After the unpacks each dword of m0/m2 is (window << 16) and each dword
    ; of m1 has the input word in its high half. The low half of m1 is stale
    ; data, but pmaddwd multiplies it by the zero low half of m0/m2, so it
    ; does not contribute to the sum.
    mova     m3, [windowq + offset2q]
    mova     m4, [inputq + offset2q]
    pxor     m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd  m0, m1
    paddd    m0, m5
    psrad    m0, 15
    pxor     m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd  m2, m1
    paddd    m2, m5
    psrad    m2, 15
    packssdw m0, m2
    mova     [outputq + offset2q], m0
    ; Same computation for the upper position, with the window reversed.
    REVERSE_WORDS m3
    mova     m4, [inputq + offsetq]
    pxor     m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd  m0, m1
    paddd    m0, m5
    psrad    m0, 15
    pxor     m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd  m2, m1
    paddd    m2, m5
    psrad    m2, 15
    packssdw m0, m2
    mova     [outputq + offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova     m0, [windowq + offset2q]
    mova     m1, [inputq + offset2q]
    mova     m2, [inputq + offsetq]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova     [outputq + offset2q], m1
    mova     [outputq + offsetq], m2
%endif
    add      offsetd, mmsize
    ; Unsigned borrow ends the loop once offset2 would drop below zero.
    sub      offset2d, mmsize
    jae      .loop
    REP_RET
%endmacro
; Instantiate APPLY_WINDOW_INT16 for each supported instruction set.

; %1 = 0: apply_window_int16_round (non-bit-exact) variants.
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0

; %1 = 1: apply_window_int16 (bit-exact) variants.
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
; The atom flag makes REVERSE_WORDS use the sse2 shuffles instead of pshufb.
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1
; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
; const uint8_t *diff, int w,
; int *left, int *left_top)