@ -70,8 +70,8 @@ SECTION .text
packuswb m0 , m1
packuswb m0 , m1
% endmacro
% endmacro
INIT_MMX
INIT_MMX mmxext
cglobal h264_weight_16_mmxext , 6 , 6 , 0
cglobal h264_weight_16 , 6 , 6 , 0
WEIGHT_SETUP
WEIGHT_SETUP
.nextrow:
.nextrow:
WEIGHT_OP 0 , 4
WEIGHT_OP 0 , 4
@ -83,8 +83,8 @@ cglobal h264_weight_16_mmxext, 6, 6, 0
jnz .nextrow
jnz .nextrow
REP_RET
REP_RET
% macro WEIGHT_FUNC_MM 3
% macro WEIGHT_FUNC_MM 2
cglobal h264_weight_ % 1 _ % 3 , 6 , 6 , % 2
cglobal h264_weight_ % 1 , 6 , 6 , % 2
WEIGHT_SETUP
WEIGHT_SETUP
.nextrow:
.nextrow:
WEIGHT_OP 0 , mmsize / 2
WEIGHT_OP 0 , mmsize / 2
@ -95,13 +95,13 @@ cglobal h264_weight_%1_%3, 6, 6, %2
REP_RET
REP_RET
% endmacro
% endmacro
; Instantiate WEIGHT_FUNC_MM for the MMX and the XMM register sets.
; Argument 1 is pasted into the symbol name (h264_weight_<arg1>) and
; argument 2 is forwarded as the fourth cglobal operand.  The
; three-argument calls additionally pass the suffix that the
; three-parameter macro form appends to the symbol name; the two-argument
; calls name the instruction set on the preceding INIT_* line instead.
INIT_MMX
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0, mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8, sse2
WEIGHT_FUNC_MM 16, 8
% macro WEIGHT_FUNC_HALF_MM 3
% macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_ % 1 _ % 3 , 6 , 6 , % 2
cglobal h264_weight_ % 1 , 6 , 6 , % 2
WEIGHT_SETUP
WEIGHT_SETUP
sar r2d , 1
sar r2d , 1
lea r3 , [ r1 * 2 ]
lea r3 , [ r1 * 2 ]
@ -120,10 +120,10 @@ cglobal h264_weight_%1_%3, 6, 6, %2
REP_RET
REP_RET
% endmacro
% endmacro
; Instantiate WEIGHT_FUNC_HALF_MM for the MMX and the XMM register sets.
; Argument 1 is pasted into the symbol name (h264_weight_<arg1>) and
; argument 2 is forwarded as the fourth cglobal operand.  The
; three-argument calls additionally pass the suffix that the
; three-parameter macro form appends to the symbol name; the two-argument
; calls name the instruction set on the preceding INIT_* line instead.
INIT_MMX
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0, mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8
% macro BIWEIGHT_SETUP 0
% macro BIWEIGHT_SETUP 0
% if ARCH_X86_64
% if ARCH_X86_64
@ -135,12 +135,32 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
add off_regd , 1
add off_regd , 1
or off_regd , 1
or off_regd , 1
add r4 , 1
add r4 , 1
cmp r5 , 128
jne .normal
sar r5 , 1
sar r6 , 1
sar off_regd , 1
sub r4 , 1
.normal
% if cpuflag(ssse3)
movd m4 , r5d
movd m0 , r6d
% else
movd m3 , r5d
movd m3 , r5d
movd m4 , r6d
movd m4 , r6d
% endif
movd m5 , off_regd
movd m5 , off_regd
movd m6 , r4d
movd m6 , r4d
pslld m5 , m6
pslld m5 , m6
psrld m5 , 1
psrld m5 , 1
% if cpuflag(ssse3)
punpcklbw m4 , m0
pshuflw m4 , m4 , 0
pshuflw m5 , m5 , 0
punpcklqdq m4 , m4
punpcklqdq m5 , m5
% else
% if mmsize == 16
% if mmsize == 16
pshuflw m3 , m3 , 0
pshuflw m3 , m3 , 0
pshuflw m4 , m4 , 0
pshuflw m4 , m4 , 0
@ -154,6 +174,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
pshufw m5 , m5 , 0
pshufw m5 , m5 , 0
% endif
% endif
pxor m7 , m7
pxor m7 , m7
% endif
% endmacro
% endmacro
% macro BIWEIGHT_STEPA 3
% macro BIWEIGHT_STEPA 3
@ -174,8 +195,8 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
packuswb m0 , m1
packuswb m0 , m1
% endmacro
% endmacro
INIT_MMX
INIT_MMX mmxext
cglobal h264_biweight_16_mmxext , 7 , 8 , 0
cglobal h264_biweight_16 , 7 , 8 , 0
BIWEIGHT_SETUP
BIWEIGHT_SETUP
movifnidn r3d , r3m
movifnidn r3d , r3m
.nextrow:
.nextrow:
@ -193,8 +214,8 @@ cglobal h264_biweight_16_mmxext, 7, 8, 0
jnz .nextrow
jnz .nextrow
REP_RET
REP_RET
% macro BIWEIGHT_FUNC_MM 3
% macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_ % 1 _ % 3 , 7 , 8 , % 2
cglobal h264_biweight_ % 1 , 7 , 8 , % 2
BIWEIGHT_SETUP
BIWEIGHT_SETUP
movifnidn r3d , r3m
movifnidn r3d , r3m
.nextrow:
.nextrow:
@ -209,13 +230,13 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
REP_RET
REP_RET
% endmacro
% endmacro
; Instantiate BIWEIGHT_FUNC_MM for the MMX and the XMM register sets.
; Argument 1 is pasted into the symbol name (h264_biweight_<arg1>) and
; argument 2 is forwarded as the fourth cglobal operand.  The
; three-argument calls additionally pass the suffix that the
; three-parameter macro form appends to the symbol name; the two-argument
; calls name the instruction set on the preceding INIT_* line instead.
INIT_MMX
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0, mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8, sse2
BIWEIGHT_FUNC_MM 16, 8
% macro BIWEIGHT_FUNC_HALF_MM 3
% macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_ % 1 _ % 3 , 7 , 8 , % 2
cglobal h264_biweight_ % 1 , 7 , 8 , % 2
BIWEIGHT_SETUP
BIWEIGHT_SETUP
movifnidn r3d , r3m
movifnidn r3d , r3m
sar r3 , 1
sar r3 , 1
@ -238,40 +259,10 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
REP_RET
REP_RET
% endmacro
% endmacro
; Instantiate BIWEIGHT_FUNC_HALF_MM for the MMX and the XMM register sets.
; Argument 1 is pasted into the symbol name (h264_biweight_<arg1>) and
; argument 2 is forwarded as the fourth cglobal operand.  The
; three-argument calls additionally pass the suffix that the
; three-parameter macro form appends to the symbol name; the two-argument
; calls name the instruction set on the preceding INIT_* line instead.
INIT_MMX
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0, mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
;-----------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
;
; Prepares the constant vectors for the SSSE3 bi-weight loops.
;
; In:    r4d = shift amount, r5d / r6d = the two multipliers,
;        r7m = offset (8th argument, read from its memory slot)
;        (presumably log2_denom, the two weights and the rounding
;        offset -- confirm against the C prototype)
; Out:   m4  = byte pair (r5 low byte, r6 low byte) replicated into
;              every 16-bit lane
;        m5  = low word of (((offset + 1) | 1) << shift) >> 1,
;              replicated into every 16-bit lane
;        m6  = shift count in the low dword, upper bits zero
; Clobb: r4, r5, r6, off_regd (r7d on x86-64, r3d otherwise), m0, flags
;
; All scalar inputs are handled as 32-bit values: they are consumed via
; movd from the dword registers, so the compare and the shifts must not
; look at the upper half of the 64-bit registers, whose contents are not
; guaranteed for 32-bit arguments on x86-64.
;-----------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
%if ARCH_X86_64
    %define off_regd r7d
%else
    %define off_regd r3d
%endif
    mov         off_regd, r7m
    add         off_regd, 1
    or          off_regd, 1             ; off = (offset + 1) | 1
    add         r4d, 1                  ; shift + 1
    cmp         r5d, 128
    jne         .normal
    ; r5d == 128: halve both multipliers and the offset, and undo the
    ; shift increment.  Presumably this keeps the multiplier inside the
    ; signed-byte range that pmaddubsw expects for its second operand.
    sar         r5d, 1
    sar         r6d, 1
    sar         off_regd, 1
    sub         r4d, 1
.normal:
    movd        m4, r5d
    movd        m0, r6d
    movd        m5, off_regd
    movd        m6, r4d
    pslld       m5, m6                  ; off << shift
    psrld       m5, 1
    punpcklbw   m4, m0                  ; low word = (r6 byte << 8) | r5 byte
    pshuflw     m4, m4, 0               ; replicate word 0 across the low qword
    pshuflw     m5, m5, 0
    punpcklqdq  m4, m4                  ; ...and into the high qword
    punpcklqdq  m5, m5
%endmacro
% macro BIWEIGHT_SSSE3_OP 0
% macro BIWEIGHT_SSSE3_OP 0
pmaddubsw m0 , m4
pmaddubsw m0 , m4
@ -283,9 +274,9 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
packuswb m0 , m2
packuswb m0 , m2
% endmacro
% endmacro
INIT_XMM
INIT_XMM ss se3
cglobal h264_biweight_16_ssse3 , 7 , 8 , 8
cglobal h264_biweight_16 , 7 , 8 , 8
BIWEIGHT_SSSE3_S ETUP
BIWEIGHT_SETUP
movifnidn r3d , r3m
movifnidn r3d , r3m
.nextrow:
.nextrow:
@ -302,9 +293,9 @@ cglobal h264_biweight_16_ssse3, 7, 8, 8
jnz .nextrow
jnz .nextrow
REP_RET
REP_RET
INIT_XMM
INIT_XMM ss se3
cglobal h264_biweight_8_ssse3 , 7 , 8 , 8
cglobal h264_biweight_8 , 7 , 8 , 8
BIWEIGHT_SSSE3_S ETUP
BIWEIGHT_SETUP
movifnidn r3d , r3m
movifnidn r3d , r3m
sar r3 , 1
sar r3 , 1
lea r4 , [ r2 * 2 ]
lea r4 , [ r2 * 2 ]