From 2966cc18493d9dc041c229b9d05ee1c3217eb32f Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Wed, 23 Jun 2010 19:20:46 +0000 Subject: [PATCH] Update x264asm header files to latest versions. Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads. Originally committed as revision 23739 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/x86/dsputil_yasm.asm | 22 +++--- libavcodec/x86/fft_mmx.asm | 24 +++--- libavcodec/x86/h264_deblock_sse2.asm | 20 ++--- libavcodec/x86/h264_idct_sse2.asm | 2 +- libavcodec/x86/x86inc.asm | 113 +++++++++++++++------------ libavcodec/x86/x86util.asm | 69 +++++++++++++--- 6 files changed, 155 insertions(+), 95 deletions(-) diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index e2478a4845..bda04727db 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -40,7 +40,7 @@ section .text align=16 %endmacro %macro FLOAT_TO_INT16_INTERLEAVE6 1 -; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) +; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 %ifdef ARCH_X86_64 %define lend r10d @@ -272,7 +272,7 @@ SCALARPRODUCT_LOOP 0 -; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) +; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top movq mm0, [topq] movq mm2, mm0 @@ -370,23 +370,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to RET %endmacro -; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) +; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) INIT_MMX cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left .skip_prologue: - mova m5, [pb_7 GLOBAL] - mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] - mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + mova m5, [pb_7] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] movd m0, leftm psllq m0, 56 ADD_HFYU_LEFT_LOOP 1 INIT_XMM cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left - mova m5, [pb_f GLOBAL] - mova m6, [pb_zzzzzzzz77777777 GLOBAL] - mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] - mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + mova m5, [pb_f] + mova m6, [pb_zzzzzzzz77777777] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] movd m0, leftm pslldq m0, 15 test srcq, 15 @@ -398,7 +398,7 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left ADD_HFYU_LEFT_LOOP 0 -; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len) +; float scalarproduct_float_sse(const float *v1, const float *v2, int len) cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset neg offsetq shl offsetq, 2 diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 9cb0ae1bfe..ea78396e7c 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -35,7 +35,7 @@ ps_m1p1: dd 1<<31, 0 %assign i 16 %rep 13 -cextern ff_cos_ %+ i +cextern cos_ %+ i %assign i i<<1 %endrep @@ -64,7 +64,7 @@ section .text align=16 mova %5, %3 pfsub %3, %4 pfadd %5, %4 ; {t6,t5} - pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} + pxor %3, [ps_m1p1] ; {t8,t7} mova %6, %1 pswapd %3, %3 pfadd %1, %5 ; {r0,i0} @@ -105,8 +105,8 @@ section .text 
align=16 addps %6, %5 ; {t1,t2,t3,t4} mova %5, %3 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} - mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} - mulps %5, [ps_root2 GLOBAL] + mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %5, [ps_root2] addps %3, %5 ; {t8,t7,ta,t9} mova %5, %6 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} @@ -309,7 +309,7 @@ fft16_sse: mova m6, Z(6) mova m7, Z(7) T4_SSE m6, m7, m0 - PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] + PASS_SMALL 0, [cos_16], [cos_16+16] ret @@ -342,12 +342,12 @@ fft8%1: T2_3DN m6, m7, Z(6), Z(7) pswapd m0, m5 pswapd m2, m7 - pxor m0, [ps_m1p1 GLOBAL] - pxor m2, [ps_m1p1 GLOBAL] + pxor m0, [ps_m1p1] + pxor m2, [ps_m1p1] pfsub m5, m0 pfadd m7, m2 - pfmul m5, [ps_root2 GLOBAL] - pfmul m7, [ps_root2 GLOBAL] + pfmul m5, [ps_root2] + pfmul m7, [ps_root2] T4_3DN m1, m3, m5, m7, m0, m2 mova Z(5), m5 mova Z(7), m7 @@ -445,7 +445,7 @@ fft %+ n %+ %3%2: add r0, n*2 - (n2&(-2<<%1)) call fft %+ n4 %+ %2 sub r0, n*6 + (n2&(-2<<%1)) - lea r1, [ff_cos_ %+ n GLOBAL] + lea r1, [cos_ %+ n] mov r2d, n4/2 jmp pass%3%2 @@ -461,10 +461,10 @@ section .text ; On x86_32, this function does the register saving and restoring for all of fft. ; The others pass args in registers and don't spill anything. cglobal fft_dispatch%3%2, 2,5,8, z, nbits - lea r2, [dispatch_tab%3%2 GLOBAL] + lea r2, [dispatch_tab%3%2] mov r2, [r2 + (nbitsq-2)*gprsize] %ifdef PIC - lea r3, [$$ GLOBAL] + lea r3, [$$] add r2, r3 %endif call r2 diff --git a/libavcodec/x86/h264_deblock_sse2.asm b/libavcodec/x86/h264_deblock_sse2.asm index bf45c7ea69..b2aa940236 100644 --- a/libavcodec/x86/h264_deblock_sse2.asm +++ b/libavcodec/x86/h264_deblock_sse2.asm @@ -234,18 +234,18 @@ SECTION .text %macro DEBLOCK_P0_Q0 0 mova m5, m1 pxor m5, m2 ; p0^q0 - pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 + pand m5, [pb_01] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 pxor m4, m1 pavgb m4, m2 ; (q0 - p0 + 256)>>1 pavgb m3, m5 paddusb m3, m4 ; d+128+33 - mova m6, [pb_a1 GLOBAL] + mova m6, [pb_a1] psubusb m6, m3 - psubusb m3, [pb_a1 GLOBAL] + psubusb m3, [pb_a1] pminub m6, m7 pminub m3, m7 psubusb m1, m6 @@ -263,7 +263,7 @@ SECTION .text pavgb %6, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 + pand %6, [pb_01] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 mova %6, %1 psubusb %6, %5 @@ -612,8 +612,8 @@ DEBLOCK_LUMA sse2, v, 16 %define mask0 spill(2) %define mask1p spill(3) %define mask1q spill(4) - %define mpb_00 [pb_00 GLOBAL] - %define mpb_01 [pb_01 GLOBAL] + %define mpb_00 [pb_00] + %define mpb_01 [pb_01] %endif ;----------------------------------------------------------------------------- @@ -637,7 +637,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 mova q1, [r0+r1] %ifdef ARCH_X86_64 pxor mpb_00, mpb_00 - mova mpb_01, [pb_01 GLOBAL] + mova mpb_01, [pb_01] LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 SWAP 7, 12 ; m12=mask0 pavgb t5, mpb_00 @@ -656,8 +656,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 mova m4, t5 mova mask0, m7 - pavgb m4, [pb_00 GLOBAL] - pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 + pavgb m4, [pb_00] + pavgb m4, [pb_01] ; alpha/4+1 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 diff --git a/libavcodec/x86/h264_idct_sse2.asm 
b/libavcodec/x86/h264_idct_sse2.asm index f8ee2b6288..86c1e66c72 100644 --- a/libavcodec/x86/h264_idct_sse2.asm +++ b/libavcodec/x86/h264_idct_sse2.asm @@ -43,7 +43,7 @@ cglobal x264_add8x4_idct_sse2, 3,3,8 movhps m3, [r1+56] IDCT4_1D 0,1,2,3,4,5 TRANSPOSE2x4x4W 0,1,2,3,4 - paddw m0, [pw_32 GLOBAL] + paddw m0, [pw_32] IDCT4_1D 0,1,2,3,4,5 pxor m7, m7 STORE_DIFF m0, m4, m7, [r0] diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm index c29ef3ee34..410b11bb28 100644 --- a/libavcodec/x86/x86inc.asm +++ b/libavcodec/x86/x86inc.asm @@ -1,25 +1,39 @@ ;***************************************************************************** ;* x86inc.asm ;***************************************************************************** -;* Copyright (C) 2005-2008 Loren Merritt +;* Copyright (C) 2005-2008 x264 project ;* -;* This file is part of FFmpeg. +;* Authors: Loren Merritt +;* Anton Mitrofanov ;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. ;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . 
+ +%define program_name ff + %ifdef ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 @@ -28,6 +42,12 @@ %endif %endif +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + ; FIXME: All of the 64bit asm functions that take a stride as an argument ; via register, assume that the high dword of that register is filled with 0. ; This is true in practice (since we never do any 64bit arithmetic on strides, @@ -47,28 +67,16 @@ %endif %endmacro -; PIC support macros. -; x86_64 can't fit 64bit address literals in most instruction types, -; so shared objects (under the assumption that they might be anywhere -; in memory) must use an address mode that does fit. -; So all accesses to global variables must use this macro, e.g. -; mov eax, [foo GLOBAL] -; instead of -; mov eax, [foo] -; -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %ifdef WIN64 %define PIC %elifndef ARCH_X86_64 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. %undef PIC %endif %ifdef PIC - %define GLOBAL wrt rip -%else - %define GLOBAL + default rel %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: @@ -163,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl %endrep %endmacro -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 %ifdef ARCH_X86_64 %define gprsize 8 @@ -259,15 +267,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %endif %endmacro -%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... ASSERT %2 >= %1 %assign regs_used %2 ASSERT regs_used <= 7 - %if %0 > 2 - %assign xmm_regs_used %3 - %else - %assign xmm_regs_used 0 - %endif + %assign xmm_regs_used %3 ASSERT xmm_regs_used <= 16 %if regs_used > 4 push r4 @@ -388,7 +392,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endif %endmacro -%macro PROLOGUE 2-4+ ; #args, #regs, arg_names... +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... ASSERT %2 >= %1 %assign regs_used %2 ASSERT regs_used <= 7 @@ -434,10 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] ; Symbol prefix for C linkage %macro cglobal 1-2+ - %xdefine %1 ff_%1 - %ifdef PREFIX - %xdefine %1 _ %+ %1 - %endif + %xdefine %1 mangle(program_name %+ _ %+ %1) %xdefine %1.skip_prologue %1 %+ .skip_prologue %ifidn __OUTPUT_FORMAT__,elf global %1:function hidden @@ -454,21 +455,28 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endmacro %macro cextern 1 - %ifdef PREFIX - %xdefine %1 _%1 - %endif + %xdefine %1 mangle(program_name %+ _ %+ %1) + extern %1 +%endmacro + +;like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) extern %1 %endmacro +%macro const 2+ + %xdefine %1 mangle(program_name %+ _ %+ %1) + global %1 + %1: %2 +%endmacro + ; This is needed for ELF, otherwise the GNU linker assumes the stack is ; executable by default. 
%ifidn __OUTPUT_FORMAT__,elf SECTION .note.GNU-stack noalloc noexec nowrite progbits %endif -%assign FENC_STRIDE 16 -%assign FDEC_STRIDE 32 - ; merge mmx and sse* %macro CAT_XDEFINE 3 @@ -575,7 +583,10 @@ INIT_MMX %endrep %endmacro -%macro SAVE_MM_PERMUTATION 1 +; If SAVE_MM_PERMUTATION is placed at the end of a function and given the +; function name, then any later calls to that function will automatically +; load the permutation, so values can be returned in mmregs. +%macro SAVE_MM_PERMUTATION 1 ; name to save as %assign %%i 0 %rep num_mmregs CAT_XDEFINE %1_m, %%i, m %+ %%i @@ -583,7 +594,7 @@ INIT_MMX %endrep %endmacro -%macro LOAD_MM_PERMUTATION 1 +%macro LOAD_MM_PERMUTATION 1 ; name to load from %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i @@ -599,7 +610,7 @@ INIT_MMX %endif %endmacro -;Substitutions that reduce instruction size but are functionally equivalent +; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index f3e0e2dbe4..a2c3b949c4 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -1,7 +1,10 @@ ;***************************************************************************** ;* x86util.asm ;***************************************************************************** -;* Copyright (C) 2008 Loren Merritt +;* Copyright (C) 2008 x264 project +;* +;* Authors: Holger Lubitz +;* Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -18,6 +21,9 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;***************************************************************************** +%assign FENC_STRIDE 16 +%assign FDEC_STRIDE 32 + %macro SBUTTERFLY 4 mova m%4, m%2 punpckl%1 m%2, m%3 @@ -25,6 +31,13 @@ SWAP %3, %4 %endmacro +%macro SBUTTERFLY2 4 + mova m%4, m%2 + punpckh%1 m%2, m%3 + punpckl%1 m%4, m%3 + SWAP %2, %4, %3 +%endmacro + %macro TRANSPOSE4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 @@ -123,14 +136,40 @@ pabsw %2, %2 %endmacro -%define ABS1 ABS1_MMX -%define ABS2 ABS2_MMX +%macro ABSB_MMX 2 + pxor %2, %2 + psubb %2, %1 + pminub %1, %2 +%endmacro + +%macro ABSB2_MMX 4 + pxor %3, %3 + pxor %4, %4 + psubb %3, %1 + psubb %4, %2 + pminub %1, %3 + pminub %2, %4 +%endmacro + +%macro ABSB_SSSE3 2 + pabsb %1, %1 +%endmacro + +%macro ABSB2_SSSE3 4 + pabsb %1, %1 + pabsb %2, %2 +%endmacro %macro ABS4 6 ABS2 %1, %2, %5, %6 ABS2 %3, %4, %5, %6 %endmacro +%define ABS1 ABS1_MMX +%define ABS2 ABS2_MMX +%define ABSB ABSB_MMX +%define ABSB2 ABSB2_MMX + %macro SPLATB_MMX 3 movd %1, [%2-3] ;to avoid crossing a cacheline punpcklbw %1, %1 @@ -226,10 +265,10 @@ ; %3/%4: source regs ; %5/%6: tmp regs %ifidn %1, d -%define mask [mask_10 GLOBAL] +%define mask [mask_10] %define shift 16 %elifidn %1, q -%define mask [mask_1100 GLOBAL] +%define mask [mask_1100] %define shift 32 %endif %if %0==6 ; less dependency if we have two tmp @@ -383,10 +422,10 @@ %macro SUMSUBD2_AB 4 mova %4, %1 mova %3, %2 - psraw %2, 1 - psraw %1, 1 - paddw %2, %4 - psubw %1, %3 + psraw %2, 1 ; %2: %2>>1 + psraw %1, 1 ; %1: %1>>1 + paddw %2, %4 ; %2: %2>>1+%1 + psubw %1, %3 ; %1: %1>>1-%2 %endmacro %macro DCT4_1D 5 @@ -407,16 +446,27 @@ %macro IDCT4_1D 5-6 %ifnum %5 SUMSUBD2_AB m%2, m%4, m%6, m%5 + ; %2: %2>>1-%4 %4: %2+%4>>1 SUMSUB_BA m%3, m%1, m%6 + ; %3: %1+%3 %1: %1-%3 SUMSUB_BADC m%4, m%3, m%2, m%1, m%6 + ; %4: 
%1+%3 + (%2+%4>>1) + ; %3: %1+%3 - (%2+%4>>1) + ; %2: %1-%3 + (%2>>1-%4) + ; %1: %1-%3 - (%2>>1-%4) %else SUMSUBD2_AB m%2, m%4, [%5], [%5+16] SUMSUB_BA m%3, m%1 SUMSUB_BADC m%4, m%3, m%2, m%1 %endif SWAP %1, %4, %3 + ; %1: %1+%3 + (%2+%4>>1) row0 + ; %2: %1-%3 + (%2>>1-%4) row1 + ; %3: %1-%3 - (%2>>1-%4) row2 + ; %4: %1+%3 - (%2+%4>>1) row3 %endmacro + %macro LOAD_DIFF 5 %ifidn %3, none movh %1, %4 @@ -512,4 +562,3 @@ packuswb %1, %1 movh %4, %1 %endmacro -
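
For readers following the PIC change described in the commit message, a minimal illustrative sketch (not part of the patch): the updated x86inc.asm issues `default rel` whenever PIC is defined, so plain memory operands are assembled RIP-relative on x86_64, and the per-operand GLOBAL suffix, which previously expanded to `wrt rip`, is no longer needed. The constant pw_32 below is the same table already referenced in h264_idct_sse2.asm.

; Illustrative sketch only -- assumes x86inc.asm is included, pw_32 is a
; 16-byte-aligned constant defined elsewhere, and the line sits inside an
; INIT_MMX/INIT_XMM function body.

; before this patch: every load of a global spelled out the addressing mode
;   paddw m0, [pw_32 GLOBAL]     ; GLOBAL expanded to "wrt rip" under PIC
;
; after this patch: x86inc.asm emits "default rel" when PIC is defined, so the
; plain operand is already RIP-relative and GLOBAL is simply dropped
    paddw m0, [pw_32]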
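The other mechanical change, dropping the hard-coded ff_ prefix from comments and cextern lines, follows from the new name mangling: cglobal and cextern now build the exported symbol from program_name (defined as ff in this patch) via mangle(). A hedged sketch, using a made-up function name:

; Sketch only -- "example_func" is a hypothetical name; cos_16 is the FFT
; table actually declared this way in fft_mmx.asm after the patch.

%include "x86inc.asm"       ; provides program_name (ff), mangle(), cglobal, cextern

cextern cos_16              ; resolves to the C symbol ff_cos_16, so the asm
                            ; writes [cos_16] instead of [ff_cos_16 GLOBAL]

SECTION .text

cglobal example_func, 0,0,0 ; exported as ff_example_func (or _ff_example_func
    RET                     ; when PREFIX is defined), same as the old ff_ prefix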
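Finally, the new comment on SAVE_MM_PERMUTATION is easiest to see with a toy example. This is a rough sketch under the assumption that x86inc.asm's call macro reloads a saved permutation when one exists, as the added comment states; the names fft_helper_example and caller_example are hypothetical, and x86inc.asm/x86util.asm are assumed to be included.

; Sketch only -- illustrates the register-permutation convention, not real DSP code.

INIT_XMM
fft_helper_example:             ; internal helper, plain label as in fft_mmx.asm
    pxor m1, m1                 ; some computation whose result ends up in xmm1
    SWAP 0, 1                   ; rename: logical m0 now refers to xmm1
    SAVE_MM_PERMUTATION fft_helper_example ; record that mapping under this name
    ret

INIT_XMM                        ; caller starts again from the default mapping
cglobal caller_example, 0,0,2
    call fft_helper_example     ; the call macro loads the saved permutation, so
                                ; m0 below names the register holding the result
    mova m1, m0                 ; consume the returned value without an extra move
    RET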