@@ -33,7 +33,11 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_6REGS && HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+                                   const uint8_t *src, x86_reg stride,
+                                   int rnd, int64_t shift);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -66,55 +70,6 @@
     "punpcklwd %%mm7, %%mm7                  \n\t"      \
     "punpckldq %%mm7, %%mm7                  \n\t"
 
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3)                \
-    "paddw     %%mm"#R2", %%mm"#R1"           \n\t"  \
-    "movd      (%0,%3), %%mm"#R0"             \n\t"  \
-    "pmullw    %%mm6, %%mm"#R1"               \n\t"  \
-    "punpcklbw %%mm0, %%mm"#R0"               \n\t"  \
-    "movd      (%0,%2), %%mm"#R3"             \n\t"  \
-    "psubw     %%mm"#R0", %%mm"#R1"           \n\t"  \
-    "punpcklbw %%mm0, %%mm"#R3"               \n\t"  \
-    "paddw     %%mm7, %%mm"#R1"               \n\t"  \
-    "psubw     %%mm"#R3", %%mm"#R1"           \n\t"  \
-    "psraw     %4, %%mm"#R1"                  \n\t"  \
-    "movq      %%mm"#R1", "#OFF"(%1)          \n\t"  \
-    "add       %2, %0                         \n\t"
-
-/** Sacrificing mm6 makes it possible to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
-                                       int rnd, int64_t shift)
-{
-    __asm__ volatile(
-        "mov       $3, %%"REG_c"           \n\t"
-        LOAD_ROUNDER_MMX("%5")
-        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
-        "1:                                \n\t"
-        "movd      (%0), %%mm2             \n\t"
-        "add       %2, %0                  \n\t"
-        "movd      (%0), %%mm3             \n\t"
-        "punpcklbw %%mm0, %%mm2            \n\t"
-        "punpcklbw %%mm0, %%mm3            \n\t"
-        SHIFT2_LINE(  0, 1, 2, 3, 4)
-        SHIFT2_LINE( 24, 2, 3, 4, 1)
-        SHIFT2_LINE( 48, 3, 4, 1, 2)
-        SHIFT2_LINE( 72, 4, 1, 2, 3)
-        SHIFT2_LINE( 96, 1, 2, 3, 4)
-        SHIFT2_LINE(120, 2, 3, 4, 1)
-        SHIFT2_LINE(144, 3, 4, 1, 2)
-        SHIFT2_LINE(168, 4, 1, 2, 3)
-        "sub       %6, %0                  \n\t"
-        "add       $8, %1                  \n\t"
-        "dec       %%"REG_c"               \n\t"
-        "jnz       1b                      \n\t"
-        : "+r"(src), "+r"(dst)
-        : "r"(stride), "r"(-2*stride),
-          "m"(shift), "m"(rnd), "r"(9*stride-4)
-          NAMED_CONSTRAINTS_ADD(ff_pw_9)
-        : "%"REG_c, "memory"
-    );
-}
-
 /**
  * Data is already unpacked, so some operations can directly be made from
  * memory.
@@ -430,7 +385,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
-        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+        { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
@@ -780,4 +735,4 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
 }
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
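Note on the table change in the third hunk: the entry being swapped is one slot of a function-pointer array indexed by the vertical quarter-pel mode, so only the symbol name changes while the dispatch stays the same. The following is a minimal, self-contained C sketch of that dispatch pattern, not FFmpeg code: the ver_16b_filter typedef, the shiftN_stub functions, and the use of ptrdiff_t in place of x86_reg are hypothetical stand-ins introduced purely for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical signature mirroring the 16-bit vertical filters above;
 * ptrdiff_t stands in for x86_reg. */
typedef void (*ver_16b_filter)(int16_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd, int64_t shift);

/* Stubs standing in for vc1_put_ver_16b_shift1_mmx,
 * ff_vc1_put_ver_16b_shift2_mmx (the new external-asm slot) and
 * vc1_put_ver_16b_shift3_mmx. */
static void shift1_stub(int16_t *dst, const uint8_t *src, ptrdiff_t stride,
                        int rnd, int64_t shift) { puts("shift1 variant"); }
static void shift2_stub(int16_t *dst, const uint8_t *src, ptrdiff_t stride,
                        int rnd, int64_t shift) { puts("shift2 variant"); }
static void shift3_stub(int16_t *dst, const uint8_t *src, ptrdiff_t stride,
                        int rnd, int64_t shift) { puts("shift3 variant"); }

int main(void)
{
    /* Index 0 means "no vertical filtering" and is handled separately,
     * hence the NULL entry, as in the table patched above. */
    static const ver_16b_filter put_shift_ver_16bits[4] =
        { NULL, shift1_stub, shift2_stub, shift3_stub };

    int16_t tmp[11 * 8];            /* 16-bit intermediate rows */
    uint8_t src[16 * 16] = { 0 };   /* dummy source block */

    for (int vmode = 1; vmode <= 3; vmode++)
        put_shift_ver_16bits[vmode](tmp, src + 16, 16, 1, 1);
    return 0;
}

Because the caller only sees the table, replacing the inline-asm shift2 routine with an external-asm one (prefixed ff_ and gated by HAVE_MMX_EXTERNAL) requires no change to the dispatch logic itself.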