@ -775,7 +775,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
% if cpuflag(avx)
% assign avx_enabled 1
% endif
% if mmsize == 16 && notcpuflag(sse2)
% if ( mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2) )
% define mova movaps
% define movu movups
% define movnta movntps
@ -869,10 +869,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
% if ARCH_X86_64
% define num_mmregs 16
% endif
% define mova v movaps
% define movu v movups
% define mova movdq a
% define movu movdq u
% undef movh
% define movnta vmovntps
% define movnta movntdq
% assign %%i 0
% rep num_mmregs
CAT_XDEFINE m , %% i , ymm %+ %% i
@ -1053,101 +1053,107 @@ INIT_XMM
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
% macro RUN_AVX_INSTR 6-7 +
% ifid %6
% define %%sizeofreg sizeof%6
% elifid %5
% define %%sizeofreg sizeof%5
% macro RUN_AVX_INSTR 5-8 +
% ifnum sizeof %6
% assign %%sizeofreg sizeof%6
% elifnum sizeof %5
% assign %%sizeofreg sizeof%5
% else
% define %%sizeofreg mmsize
% assign %%sizeofreg mmsize
% endif
% if %%sizeofreg==32
% if %4>=3
v % 1 % 5 , % 6 , % 7
% else
v % 1 % 5 , % 6
% endif
% assign %%emulate_avx 0
% if avx_enabled && %%sizeofreg >= 16
% xdefine %%instr v%1
% else
% if %%sizeofreg==8
% define %%regmov movq
% elif %2
% define %%regmov movaps
% else
% define %%regmov movdqa
% xdefine %%instr %1
% if %0 >= 7+%3
% assign %%emulate_avx 1
% endif
% endif
% if %4>=3+%3
% ifnidn %5, %6
% if avx_enabled && %%sizeofreg==16
v % 1 % 5 , % 6 , % 7
% else
CHECK_AVX_INSTR_EMU { % 1 % 5 , % 6 , % 7 } , % 5 , % 7
% %regmov %5, %6
% 1 %5, %7
% if %%emulate_avx
% xdefine %%src1 %6
% xdefine %%src2 %7
% ifnidn %5, %6
% if %0 >= 8
CHECK_AVX_INSTR_EMU { % 1 % 5 , % 6 , % 7 , % 8 } , % 5 , % 7 , % 8
% else
CHECK_AVX_INSTR_EMU { % 1 % 5 , % 6 , % 7 } , % 5 , % 7
% endif
% if %4 && %3 == 0
% ifnid %7
; 3-operand AVX instructions with a memory arg can only have it in src2,
; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
; So, if the instruction is commutative with a memory arg, swap them.
% xdefine %%src1 %7
% xdefine %%src2 %6
% endif
% endif
% if %%sizeofreg == 8
MOVQ % 5 , %% src1
% elif %2
MOVAPS % 5 , %% src1
% else
% 1 %5, %7
MOVDQA % 5 , %% src1
% endif
% elif %4>=3
% 1 %5, %6, %7
% else
% 1 %5, %6
% endif
% endif
% endmacro
; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
% macro RUN_AVX_INSTR1 8
% assign %%swap 0
% if avx_enabled
% ifnid %6
% assign %%swap 1
% endif
% elifnidn %5, %6
% ifnid %7
% assign %%swap 1
% if %0 >= 8
% 1 %5, %%src2, %8
% else
% 1 %5, %%src2
% endif
% endif
% if %%swap && %3 == 0 && %8 == 1
RUN_AVX_INSTR % 1 , % 2 , % 3 , % 4 , % 5 , % 7 , % 6
% elif %0 >= 8
% %instr %5, %6, %7, %8
% elif %0 == 7
% %instr %5, %6, %7
% elif %0 == 6
% %instr %5, %6
% else
RUN_AVX_INSTR % 1 , % 2 , % 3 , % 4 , % 5 , % 6 , % 7
% %instr %5
% endif
% endmacro
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
% macro AVX_INSTR 4
% macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
% ifidn %3, fnord
RUN_AVX_INSTR % 6 , % 7 , % 8 , 2 , % 1 , % 2
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
% macro AVX_INSTR 1-4 0, 1, 0
% macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
% ifidn %2, fnord
RUN_AVX_INSTR % 6 , % 7 , % 8 , % 9 , % 1
% elifidn %3, fnord
RUN_AVX_INSTR % 6 , % 7 , % 8 , % 9 , % 1 , % 2
% elifidn %4, fnord
RUN_AVX_INSTR1 % 6 , % 7 , % 8 , 3 , % 1 , % 2 , % 3 , % 9
RUN_AVX_INSTR % 6 , % 7 , % 8 , % 9 , % 1 , % 2 , % 3
% elifidn %5, fnord
RUN_AVX_INSTR % 6 , % 7 , % 8 , 4 , % 1 , % 2 , % 3 , % 4
RUN_AVX_INSTR % 6 , % 7 , % 8 , % 9 , % 1 , % 2 , % 3 , % 4
% else
RUN_AVX_INSTR % 6 , % 7 , % 8 , 5 , % 1 , % 2 , % 3 , % 4 , % 5
RUN_AVX_INSTR % 6 , % 7 , % 8 , % 9 , % 1 , % 2 , % 3 , % 4 , % 5
% endif
% endmacro
% endmacro
; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd , 1 , 0 , 1
AVX_INSTR addps , 1 , 0 , 1
AVX_INSTR addsd , 1 , 0 , 1
AVX_INSTR addss , 1 , 0 , 1
AVX_INSTR addsubpd , 1 , 0 , 0
AVX_INSTR addsubps , 1 , 0 , 0
AVX_INSTR andpd , 1 , 0 , 1
AVX_INSTR andps , 1 , 0 , 1
AVX_INSTR aesdec , 0 , 0 , 0
AVX_INSTR aesdeclast , 0 , 0 , 0
AVX_INSTR aesenc , 0 , 0 , 0
AVX_INSTR aesenclast , 0 , 0 , 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd , 1 , 0 , 0
AVX_INSTR andnps , 1 , 0 , 0
AVX_INSTR andpd , 1 , 0 , 1
AVX_INSTR andps , 1 , 0 , 1
AVX_INSTR bl endpd , 1 , 0 , 0
AVX_INSTR bl endps , 1 , 0 , 0
AVX_INSTR bl endvpd , 1 , 0 , 0
@ -1156,19 +1162,39 @@ AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps , 1 , 1 , 0
AVX_INSTR cmpsd , 1 , 1 , 0
AVX_INSTR cmpss , 1 , 1 , 0
AVX_INSTR cvtdq2ps , 1 , 0 , 0
AVX_INSTR cvtpd2dq , 1 , 0 , 0
AVX_INSTR cvtps2dq , 1 , 0 , 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR di vpd , 1 , 0 , 0
AVX_INSTR di vps , 1 , 0 , 0
AVX_INSTR di vsd , 1 , 0 , 0
AVX_INSTR di vss , 1 , 0 , 0
AVX_INSTR dppd , 1 , 1 , 0
AVX_INSTR dpps , 1 , 1 , 0
AVX_INSTR extractps
AVX_INSTR haddpd , 1 , 0 , 0
AVX_INSTR haddps , 1 , 0 , 0
AVX_INSTR hsubpd , 1 , 0 , 0
AVX_INSTR hsubps , 1 , 0 , 0
AVX_INSTR insertps , 1 , 1 , 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd , 1 , 0 , 1
AVX_INSTR maxps , 1 , 0 , 1
AVX_INSTR maxsd , 1 , 0 , 1
@ -1177,10 +1203,31 @@ AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps , 1 , 0 , 1
AVX_INSTR minsd , 1 , 0 , 1
AVX_INSTR minss , 1 , 0 , 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps , 1 , 0 , 0
AVX_INSTR movhpd , 1 , 0 , 0
AVX_INSTR movhps , 1 , 0 , 0
AVX_INSTR movlhps , 1 , 0 , 0
AVX_INSTR movlpd , 1 , 0 , 0
AVX_INSTR movlps , 1 , 0 , 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd , 1 , 0 , 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss , 1 , 0 , 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw , 0 , 1 , 0
AVX_INSTR mulpd , 1 , 0 , 1
AVX_INSTR mulps , 1 , 0 , 1
@ -1188,9 +1235,9 @@ AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss , 1 , 0 , 1
AVX_INSTR orpd , 1 , 0 , 1
AVX_INSTR orps , 1 , 0 , 1
AVX_INSTR pabsb , 0 , 0 , 0
AVX_INSTR pabsw , 0 , 0 , 0
AVX_INSTR pabsd , 0 , 0 , 0
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb , 0 , 0 , 0
AVX_INSTR packssdw , 0 , 0 , 0
AVX_INSTR packuswb , 0 , 0 , 0
@ -1210,10 +1257,11 @@ AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw , 0 , 0 , 1
AVX_INSTR pblendvb , 0 , 0 , 0
AVX_INSTR pblendw , 0 , 1 , 0
AVX_INSTR pcmpestri , 0 , 0 , 0
AVX_INSTR pcmpestrm , 0 , 0 , 0
AVX_INSTR pcmpistri , 0 , 0 , 0
AVX_INSTR pcmpistrm , 0 , 0 , 0
AVX_INSTR pclmulqdq , 0 , 1 , 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb , 0 , 0 , 1
AVX_INSTR pcmpeqw , 0 , 0 , 1
AVX_INSTR pcmpeqd , 0 , 0 , 1
@ -1222,12 +1270,21 @@ AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw , 0 , 0 , 0
AVX_INSTR pcmpgtd , 0 , 0 , 0
AVX_INSTR pcmpgtq , 0 , 0 , 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw , 0 , 0 , 0
AVX_INSTR phaddd , 0 , 0 , 0
AVX_INSTR phaddsw , 0 , 0 , 0
AVX_INSTR phminposuw
AVX_INSTR phsubw , 0 , 0 , 0
AVX_INSTR phsubd , 0 , 0 , 0
AVX_INSTR phsubsw , 0 , 0 , 0
AVX_INSTR pinsrb , 0 , 1 , 0
AVX_INSTR pinsrd , 0 , 1 , 0
AVX_INSTR pinsrq , 0 , 1 , 0
AVX_INSTR pinsrw , 0 , 1 , 0
AVX_INSTR pmaddwd , 0 , 0 , 1
AVX_INSTR pmaddubsw , 0 , 0 , 0
AVX_INSTR pmaxsb , 0 , 0 , 1
@ -1242,20 +1299,32 @@ AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub , 0 , 0 , 1
AVX_INSTR pminuw , 0 , 0 , 1
AVX_INSTR pminud , 0 , 0 , 1
AVX_INSTR pmovmskb , 0 , 0 , 0
AVX_INSTR pmulhuw , 0 , 0 , 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq , 0 , 0 , 1
AVX_INSTR pmulhrsw , 0 , 0 , 1
AVX_INSTR pmulhuw , 0 , 0 , 1
AVX_INSTR pmulhw , 0 , 0 , 1
AVX_INSTR pmullw , 0 , 0 , 1
AVX_INSTR pmulld , 0 , 0 , 1
AVX_INSTR pmuludq , 0 , 0 , 1
AVX_INSTR pmuldq , 0 , 0 , 1
AVX_INSTR por , 0 , 0 , 1
AVX_INSTR psadbw , 0 , 0 , 1
AVX_INSTR pshufb , 0 , 0 , 0
AVX_INSTR pshufd , 0 , 1 , 0
AVX_INSTR pshufhw , 0 , 1 , 0
AVX_INSTR pshuflw , 0 , 1 , 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb , 0 , 0 , 0
AVX_INSTR psignw , 0 , 0 , 0
AVX_INSTR psignd , 0 , 0 , 0
@ -1277,7 +1346,7 @@ AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw , 0 , 0 , 0
AVX_INSTR psubusb , 0 , 0 , 0
AVX_INSTR psubusw , 0 , 0 , 0
AVX_INSTR ptest , 0 , 0 , 0
AVX_INSTR ptest
AVX_INSTR punpckhbw , 0 , 0 , 0
AVX_INSTR punpckhwd , 0 , 0 , 0
AVX_INSTR punpckhdq , 0 , 0 , 0
@ -1287,11 +1356,27 @@ AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq , 0 , 0 , 0
AVX_INSTR punpcklqdq , 0 , 0 , 0
AVX_INSTR pxor , 0 , 0 , 1
AVX_INSTR rcpps , 1 , 0 , 0
AVX_INSTR rcpss , 1 , 0 , 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps , 1 , 0 , 0
AVX_INSTR rsqrtss , 1 , 0 , 0
AVX_INSTR shufpd , 1 , 1 , 0
AVX_INSTR shufps , 1 , 1 , 0
AVX_INSTR sqrtpd , 1 , 0 , 0
AVX_INSTR sqrtps , 1 , 0 , 0
AVX_INSTR sqrtsd , 1 , 0 , 0
AVX_INSTR sqrtss , 1 , 0 , 0
AVX_INSTR stmxcsr
AVX_INSTR subpd , 1 , 0 , 0
AVX_INSTR subps , 1 , 0 , 0
AVX_INSTR subsd , 1 , 0 , 0
AVX_INSTR subss , 1 , 0 , 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd , 1 , 0 , 0
AVX_INSTR unpckhps , 1 , 0 , 0
AVX_INSTR unpcklpd , 1 , 0 , 0