vp3: port x86 SIMD to cpuflags.

13 years ago · 4a26fdd852
parent 42ade117dd
commit 4a26fdd852
1 changed file with 47 additions and 47 deletions
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@ -102,8 +102,8 @@ SECTION .text
    mov  [r0+r3  -1], r2w
 %endmacro
-INIT_MMX
+INIT_MMX mmx2
-cglobal vp3_v_loop_filter_mmx2, 3, 4
+cglobal vp3_v_loop_filter, 3, 4
 %if ARCH_X86_64
    movsxd        r1, r1d
 %endif
@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4
    movq     [r0   ], m3
    RET
-cglobal vp3_h_loop_filter_mmx2, 3, 4
+cglobal vp3_h_loop_filter, 3, 4
 %if ARCH_X86_64
    movsxd        r1, r1d
 %endif
@ -354,38 +354,6 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
    movq        I(2), m2
 %endmacro
 ; VP3_IDCT_mmx <base>
 ;   %1 = base address expression; every I()/J() operand below is formed as
 ;        [%1 + offset].
 ;   Expands to two RowIDCT+Transpose passes followed by two ColumnIDCT
 ;   passes. The I()/J() operand macros are re-bound before each pass so the
 ;   same helper macros address a different part of the data each time.
 ;   Also (re)defines OC_8 and C(x) for whatever is expanded afterwards.
 %macro VP3_IDCT_mmx 1
    ; NOTE(review): the register names listed below (eax/ebx/ecx/edx) are not
    ; referenced anywhere in this macro body, which addresses its data through
    ; %1 and vp3_idct_data instead. Presumably they are left over from an
    ; earlier version -- confirm against RowIDCT/ColumnIDCT.
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
    ; OC_8: memory operand for the pw_8 constant.
    ; C(x): 1-based index into vp3_idct_data with a 16-byte stride.
 %define OC_8 [pw_8]
 %define C(x) [vp3_idct_data+16*(x-1)]
    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
    ; Row pass 1: I(x) is slot x at a 16-byte stride; J(x) is byte offset 8
    ; inside slot x-4 (presumably used with x = 4..7 -- confirm in RowIDCT).
 %define I(x) [%1+16* x     ]
 %define J(x) [%1+16*(x-4)+8]
    RowIDCT
    Transpose
    ; Row pass 2: same layout as pass 1, shifted 64 bytes further into the data.
 %define I(x) [%1+16* x   +64]
 %define J(x) [%1+16*(x-4)+72]
    RowIDCT
    Transpose
    ; Column pass 1: I(x) and J(x) are identical here, both addressing byte
    ; offset 0 of each 16-byte slot.
 %define I(x) [%1+16*x]
 %define J(x) [%1+16*x]
    ColumnIDCT
    ; Column pass 2: same as column pass 1, at byte offset 8 of each slot.
 %define I(x) [%1+16*x+8]
 %define J(x) [%1+16*x+8]
    ColumnIDCT
 %endmacro
 %macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
@ -501,7 +469,8 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
    movdqa      O(7), m%8
 %endmacro
-%macro VP3_IDCT_sse2 1
+%macro VP3_IDCT 1
 %if mmsize == 16
 %define I(x) [%1+16*x]
 %define O(x) [%1+16*x]
 %define C(x) [vp3_idct_data+16*(x-1)]
@ -519,11 +488,42 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 %define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
 %else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
 %define OC_8 [pw_8]
 %define C(x) [vp3_idct_data+16*(x-1)]
    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
 %define I(x) [%1+16* x     ]
 %define J(x) [%1+16*(x-4)+8]
    RowIDCT
    Transpose
 %define I(x) [%1+16* x   +64]
 %define J(x) [%1+16*(x-4)+72]
    RowIDCT
    Transpose
 %define I(x) [%1+16*x]
 %define J(x) [%1+16*x]
    ColumnIDCT
 %define I(x) [%1+16*x+8]
 %define J(x) [%1+16*x+8]
    ColumnIDCT
 %endif ; mmsize == 16/8
 %endmacro
-%macro vp3_idct_funcs 1
+%macro vp3_idct_funcs 0
-cglobal vp3_idct_put_%1, 3, 4, 9
+cglobal vp3_idct_put, 3, 4, 9
-    VP3_IDCT_%1   r2
+    VP3_IDCT      r2
    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]
@ -565,8 +565,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9
 %endrep
    RET
-cglobal vp3_idct_add_%1, 3, 4, 9
+cglobal vp3_idct_add, 3, 4, 9
-    VP3_IDCT_%1   r2
+    VP3_IDCT      r2
    mov           r3, 4
    pxor          m4, m4
@ -607,10 +607,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9
    RET
 %endmacro
-INIT_MMX
+INIT_MMX mmx
-vp3_idct_funcs mmx
+vp3_idct_funcs
-INIT_XMM
+INIT_XMM sse2
-vp3_idct_funcs sse2
+vp3_idct_funcs
 %macro DC_ADD 0
    movq          m2, [r0     ]
@ -631,8 +631,8 @@ vp3_idct_funcs sse2
    movq   [r0+r3  ], m5
 %endmacro
-INIT_MMX
+INIT_MMX mmx2
-cglobal vp3_idct_dc_add_mmx2, 3, 4
+cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
    movsxd        r1, r1d
 %endif