Use pextrw for SSE4 mbedge filter result writing, a 5-10 cycle speedup on
CPUs supporting it.

Originally committed as revision 24437 to svn://svn.ffmpeg.org/ffmpeg/trunk
Ronald S. Bultje 15 years ago
parent 9bb9875eb7
commit dc5eec8085
libavcodec/x86/vp8dsp-init.c |  4
libavcodec/x86/vp8dsp.asm    | 35
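
The change in a nutshell: the pre-SSE4 code has to bounce pixels through a general-purpose register (movd) and peel 16-bit pieces off with shifts, while SSE4.1's pextrw can store a word lane of an XMM register straight to memory. The intrinsics below are only an illustrative sketch of that difference, not the committed code: the function names and the simple consecutive-line layout are invented for illustration (the real work is done by the WRITE_8W macro in the diff below), and strict-aliasing niceties of the uint16_t stores are glossed over.

    #include <stddef.h>
    #include <stdint.h>
    #include <emmintrin.h>   /* SSE2: _mm_cvtsi128_si32, _mm_srli_si128      */
    #include <smmintrin.h>   /* SSE4.1 header, enables memory-operand pextrw */

    /* Pre-SSE4 idea: movd the low 32 bits into a GPR, store two 16-bit
     * halves, shift the vector down and repeat: one GPR round-trip per
     * two destination lines. */
    static void write_8w_pre_sse4(uint8_t *dst, ptrdiff_t stride, __m128i px)
    {
        for (int i = 0; i < 4; i++) {
            uint32_t v = (uint32_t)_mm_cvtsi128_si32(px);     /* movd   r32, xmm */
            *(uint16_t *)dst            = (uint16_t)v;
            *(uint16_t *)(dst + stride) = (uint16_t)(v >> 16);
            px   = _mm_srli_si128(px, 4);                     /* psrldq xmm, 4   */
            dst += 2 * stride;
        }
    }

    /* SSE4.1 idea: each lane goes straight from the XMM register to memory;
     * with SSE4.1 enabled a compiler can fold the extract+store into
     * pextrw [mem], xmm, imm. */
    static void write_8w_sse4(uint8_t *dst, ptrdiff_t stride, __m128i px)
    {
        *(uint16_t *)(dst + 0 * stride) = (uint16_t)_mm_extract_epi16(px, 0);
        *(uint16_t *)(dst + 1 * stride) = (uint16_t)_mm_extract_epi16(px, 1);
        *(uint16_t *)(dst + 2 * stride) = (uint16_t)_mm_extract_epi16(px, 2);
        *(uint16_t *)(dst + 3 * stride) = (uint16_t)_mm_extract_epi16(px, 3);
        *(uint16_t *)(dst + 4 * stride) = (uint16_t)_mm_extract_epi16(px, 4);
        *(uint16_t *)(dst + 5 * stride) = (uint16_t)_mm_extract_epi16(px, 5);
        *(uint16_t *)(dst + 6 * stride) = (uint16_t)_mm_extract_epi16(px, 6);
        *(uint16_t *)(dst + 7 * stride) = (uint16_t)_mm_extract_epi16(px, 7);
    }

Dropping the movd/shr dependency chain per pair of lines is presumably where the quoted 5-10 cycle saving comes from.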

--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -247,6 +247,7 @@ DECLARE_LOOP_FILTER(mmx)
 DECLARE_LOOP_FILTER(mmxext)
 DECLARE_LOOP_FILTER(sse2)
 DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
 
 #endif
@@ -379,6 +380,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     if (mm_flags & FF_MM_SSE4) {
         c->vp8_idct_dc_add       = ff_vp8_idct_dc_add_sse4;
+
+        c->vp8_h_loop_filter16y  = ff_vp8_h_loop_filter16y_mbedge_sse4;
+        c->vp8_h_loop_filter8uv  = ff_vp8_h_loop_filter8uv_mbedge_sse4;
     }
 #endif
 }

--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1932,10 +1932,24 @@ INNER_LOOPFILTER ssse3, h, 6, 8, 13
 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
 ; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
 ; 4 is a pointer to the destination's 4th line
-; 5 is -stride and +stride
-%macro WRITE_8W 6
+; 5/6 is -stride and +stride
+; 7 is optimization string
+%macro WRITE_8W 7
+%ifidn %7, sse4
+    pextrw [%4+%5*4], %1, 0
+    pextrw [%3+%5*4], %1, 1
+    pextrw [%4+%5*2], %1, 2
+    pextrw [%4+%5  ], %1, 3
+    pextrw [%4     ], %1, 4
+    pextrw [%3     ], %1, 5
+    pextrw [%3+%6  ], %1, 6
+    pextrw [%3+%6*2], %1, 7
+%else
     movd             %3, %1
 %if mmsize == 8
     punpckhdq        %1, %1
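
To make the SSE4 branch's addressing above concrete: with %3 pointing at the destination's 5th line, %4 at its 4th line, %5 = -stride and %6 = +stride, lane i of the register lands on line i. A hypothetical C rendering, with names invented purely for illustration:

    #include <stddef.h>
    #include <stdint.h>
    #include <smmintrin.h>

    /* line4 stands in for %4, line5 for %3, stride for %6 (so -stride is %5). */
    static void write_8w_sse4_lanes(uint8_t *line4, uint8_t *line5,
                                    ptrdiff_t stride, __m128i px)
    {
        *(uint16_t *)(line4 - 4 * stride) = (uint16_t)_mm_extract_epi16(px, 0); /* line 0 */
        *(uint16_t *)(line5 - 4 * stride) = (uint16_t)_mm_extract_epi16(px, 1); /* line 1 */
        *(uint16_t *)(line4 - 2 * stride) = (uint16_t)_mm_extract_epi16(px, 2); /* line 2 */
        *(uint16_t *)(line4 - 1 * stride) = (uint16_t)_mm_extract_epi16(px, 3); /* line 3 */
        *(uint16_t *)(line4)              = (uint16_t)_mm_extract_epi16(px, 4); /* line 4 */
        *(uint16_t *)(line5)              = (uint16_t)_mm_extract_epi16(px, 5); /* line 5 */
        *(uint16_t *)(line5 + 1 * stride) = (uint16_t)_mm_extract_epi16(px, 6); /* line 6 */
        *(uint16_t *)(line5 + 2 * stride) = (uint16_t)_mm_extract_epi16(px, 7); /* line 7 */
    }

Splitting the destination across two base pointers keeps every offset at 1, 2 or 4 times the stride, which the x86 base+index*scale addressing modes can encode directly, so no extra lea is needed between the eight stores.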
@@ -1974,6 +1988,7 @@ INNER_LOOPFILTER ssse3, h, 6, 8, 13
 %if mmsize == 8
     add              %4, %5
 %endif
+%endif
 %endmacro
 
 %macro MBEDGE_LOOPFILTER 5
@@ -2509,14 +2524,17 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %if mmsize == 8 ; mmx/mmxext (h)
     WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
     add          dst_reg, 4
-    WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
+    WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
 %else ; sse2 (h)
     lea         dst8_reg, [dst8_reg+mstride_reg+1]
     WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
     lea          dst_reg, [dst2_reg+mstride_reg+4]
     lea         dst8_reg, [dst8_reg+mstride_reg+4]
-    WRITE_8W         m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg
-    WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
+    WRITE_8W         m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
+%ifidn %2, sse4
+    lea          dst_reg, [dst8_reg+ stride_reg]
+%endif
+    WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
 %endif
 %endif
@@ -2574,3 +2592,10 @@ MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
 %endif
 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16
+
+%ifdef m8
+MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER sse4, h, 6, 8, 16
