|
|
|
/*
|
|
|
|
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
|
|
|
|
*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef AVUTIL_X86_ASM_H
|
|
|
|
#define AVUTIL_X86_ASM_H
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
#include "config.h"
|
|
|
|
|
|
|
|
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
|
x86/vp9: inital AVX2 intra_pred
tos3k-vp9-b10000.webm on a Core i5-4200U @1.6GHz
1219 decicycles in ff_vp9_ipred_dc_32x32_ssse3, 131070 runs, 2 skips
439 decicycles in ff_vp9_ipred_dc_32x32_avx2, 131070 runs, 2 skips
3570 decicycles in ff_vp9_ipred_dc_top_32x32_ssse3, 4096 runs, 0 skips
2494 decicycles in ff_vp9_ipred_dc_top_32x32_avx2, 4096 runs, 0 skips
1419 decicycles in ff_vp9_ipred_dc_left_32x32_ssse3, 16384 runs, 0 skips
717 decicycles in ff_vp9_ipred_dc_left_32x32_avx2, 16384 runs, 0 skips
2737 decicycles in ff_vp9_ipred_tm_32x32_avx, 1024 runs, 0 skips
2088 decicycles in ff_vp9_ipred_tm_32x32_avx2, 1024 runs, 0 skips
3090 decicycles in ff_vp9_ipred_v_32x32_avx, 512 runs, 0 skips
2226 decicycles in ff_vp9_ipred_v_32x32_avx2, 512 runs, 0 skips
1565 decicycles in ff_vp9_ipred_h_32x32_avx, 1024 runs, 0 skips
922 decicycles in ff_vp9_ipred_h_32x32_avx2, 1024 runs, 0 skips
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago
|
|
|
typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
|
|
|
|
|
|
|
|
#if ARCH_X86_64
|
|
|
|
# define OPSIZE "q"
|
|
|
|
# define REG_a "rax"
|
|
|
|
# define REG_b "rbx"
|
|
|
|
# define REG_c "rcx"
|
|
|
|
# define REG_d "rdx"
|
|
|
|
# define REG_D "rdi"
|
|
|
|
# define REG_S "rsi"
|
|
|
|
# define PTR_SIZE "8"
|
|
|
|
typedef int64_t x86_reg;
|
|
|
|
|
|
|
|
/* REG_SP is defined in Solaris sys headers, so use REG_sp */
|
|
|
|
# define REG_sp "rsp"
|
|
|
|
# define REG_BP "rbp"
|
|
|
|
# define REGBP rbp
|
|
|
|
# define REGa rax
|
|
|
|
# define REGb rbx
|
|
|
|
# define REGc rcx
|
|
|
|
# define REGd rdx
|
|
|
|
# define REGSP rsp
|
|
|
|
|
|
|
|
#elif ARCH_X86_32
|
|
|
|
|
|
|
|
# define OPSIZE "l"
|
|
|
|
# define REG_a "eax"
|
|
|
|
# define REG_b "ebx"
|
|
|
|
# define REG_c "ecx"
|
|
|
|
# define REG_d "edx"
|
|
|
|
# define REG_D "edi"
|
|
|
|
# define REG_S "esi"
|
|
|
|
# define PTR_SIZE "4"
|
|
|
|
typedef int32_t x86_reg;
|
|
|
|
|
|
|
|
# define REG_sp "esp"
|
|
|
|
# define REG_BP "ebp"
|
|
|
|
# define REGBP ebp
|
|
|
|
# define REGa eax
|
|
|
|
# define REGb ebx
|
|
|
|
# define REGc ecx
|
|
|
|
# define REGd edx
|
|
|
|
# define REGSP esp
|
|
|
|
#else
|
|
|
|
typedef int x86_reg;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE))
|
|
|
|
#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE))
|
|
|
|
|
|
|
|
#if ARCH_X86_64 && defined(PIC)
|
|
|
|
# define BROKEN_RELOCATIONS 1
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If gcc is not set to support sse (-msse) it will not accept xmm registers
|
|
|
|
* in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm
|
|
|
|
* registers to be marked as clobbered and evaluates to nothing if they are
|
|
|
|
* not supported, or to the list itself if they are supported. Since a clobber
|
|
|
|
* list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm
|
|
|
|
* registers are the only in the clobber list.
|
|
|
|
* For example a list with "eax" and "xmm0" as clobbers should become:
|
|
|
|
* : XMM_CLOBBERS("xmm0",) "eax"
|
|
|
|
* and a list with only "xmm0" should become:
|
|
|
|
* XMM_CLOBBERS_ONLY("xmm0")
|
|
|
|
*/
|
|
|
|
#if HAVE_XMM_CLOBBERS
|
|
|
|
# define XMM_CLOBBERS(...) __VA_ARGS__
|
|
|
|
# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__
|
|
|
|
#else
|
|
|
|
# define XMM_CLOBBERS(...)
|
|
|
|
# define XMM_CLOBBERS_ONLY(...)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Use to export labels from asm. */
|
|
|
|
#define LABEL_MANGLE(a) EXTERN_PREFIX #a
|
|
|
|
|
|
|
|
// Use rip-relative addressing if compiling PIC code on x86-64.
|
|
|
|
#if ARCH_X86_64 && defined(PIC)
|
|
|
|
# define LOCAL_MANGLE(a) #a "(%%rip)"
|
|
|
|
#else
|
|
|
|
# define LOCAL_MANGLE(a) #a
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
|
|
|
|
# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
|
|
|
|
# define NAMED_CONSTRAINTS_ADD(...)
|
|
|
|
# define NAMED_CONSTRAINTS(...)
|
|
|
|
# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
|
|
|
|
# define NAMED_CONSTRAINTS_ARRAY(...)
|
|
|
|
#else
|
|
|
|
/* When direct symbol references are used in code passed to a compiler that does not support them
|
|
|
|
* then these references need to be converted to named asm constraints instead.
|
|
|
|
* Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol.
|
|
|
|
* In order for this to work there must also be a corresponding entry in the asm-interface. To add this
|
|
|
|
* entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
|
|
|
|
* corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ).
|
|
|
|
* If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
|
|
|
|
*/
|
|
|
|
# define MANGLE(a) "%["#a"]"
|
|
|
|
// Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
|
|
|
|
# define FE_0(P,X) P(X)
|
|
|
|
# define FE_1(P,X,X1) P(X), FE_0(P,X1)
|
|
|
|
# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
|
|
|
|
# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
|
|
|
|
# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
|
|
|
|
# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
|
|
|
|
# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
|
|
|
|
# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
|
|
|
|
# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
|
|
|
|
# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
|
|
|
|
# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
|
|
|
|
# define GET_FE(A) GET_FE_IMPL A
|
|
|
|
# define GET_FE_GLUE(x, y) x y
|
|
|
|
# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
|
|
|
|
# define NAME_CONSTRAINT(x) [x] "m"(x)
|
|
|
|
// Parameters are a list of each symbol reference required
|
|
|
|
# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
|
|
|
|
// Same but without comma for when there are no previously defined constraints
|
|
|
|
# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
|
|
|
|
// Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
|
|
|
|
# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
|
|
|
|
# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
|
|
|
|
# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* AVUTIL_X86_ASM_H */
|