From e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Tue, 9 Sep 2014 14:38:58 +0200 Subject: [PATCH] av_filter/x86/idet: MMX/SSE2 implementation of 16-bit filter_line() tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv MMX: ~30% faster decoding overall SSE2: ~40% faster Signed-off-by: Michael Niedermayer --- libavfilter/vf_idet.c | 11 ++++-- libavfilter/vf_idet.h | 7 +++- libavfilter/x86/vf_idet.asm | 70 +++++++++++++++++++++++++++++++++- libavfilter/x86/vf_idet_init.c | 29 +++++++++++--- 4 files changed, 103 insertions(+), 14 deletions(-) diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c index 4416228431..22ff494dfb 100644 --- a/libavfilter/vf_idet.c +++ b/libavfilter/vf_idet.c @@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, return ret; } -static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) +int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w) { int x; int ret=0; @@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref) if (!idet->csp) idet->csp = av_pix_fmt_desc_get(link->format); - if (idet->csp->comp[0].depth_minus1 / 8 == 1) - idet->filter_line = (void*)filter_line_c_16bit; + if (idet->csp->comp[0].depth_minus1 / 8 == 1){ + idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit; + if (ARCH_X86) + ff_idet_init_x86(idet, 1); + } filter(ctx); @@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx) idet->filter_line = ff_idet_filter_line_c; if (ARCH_X86) - ff_idet_init_x86(idet); + ff_idet_init_x86(idet, 0); return 0; } diff --git a/libavfilter/vf_idet.h b/libavfilter/vf_idet.h index 05506901f2..c5799fb67d 100644 --- a/libavfilter/vf_idet.h +++ b/libavfilter/vf_idet.h @@ -24,6 +24,8 @@ #define HIST_SIZE 4 +typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); + typedef
enum { TFF, BFF, @@ -45,14 +47,15 @@ typedef struct { AVFrame *cur; AVFrame *next; AVFrame *prev; - int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); + ff_idet_filter_func filter_line; const AVPixFmtDescriptor *csp; } IDETContext; -void ff_idet_init_x86(IDETContext *idet); +void ff_idet_init_x86(IDETContext *idet, int for_16b); /* main fall-back for left-over */ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); +int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w); #endif diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm index 14b16c5779..4649cae030 100644 --- a/libavfilter/x86/vf_idet.asm +++ b/libavfilter/x86/vf_idet.asm @@ -25,8 +25,6 @@ SECTION_TEXT -%if ARCH_X86_32 - ; Implementation that does 8-bytes at a time using single-word operations. %macro IDET_FILTER_LINE 1 INIT_MMX %1 @@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index RET %endmacro +%if ARCH_X86_32 IDET_FILTER_LINE mmxext IDET_FILTER_LINE mmx %endif +;****************************************************************************** +; 16bit implementation that does 4/8-pixels at a time + +%macro PABS_DIFF_WD 3 ; a, b, junk , output=a + psubusw %3, %2, %1 + psubusw %1, %2 + por %1, %3 + + mova %2, %1 + punpcklwd %1, m_zero + punpckhwd %2, m_zero + paddd %1, %2 +%endmacro + +%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words) +cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index + xor indexq, indexq +%define m_zero m1 +%define m_sum m0 + pxor m_sum, m_sum + pxor m_zero, m_zero + +.loop_16bit: + movu m2, [bq + indexq * 2] ; B + movu m3, [aq + indexq * 2] ; A + mova m6, m2 + psubusw m5, m2, m3 ; ba + + movu m4, [cq + indexq * 2] ; C + add indexq, %1 + psubusw m3, m2 ; ab + CMP indexd, widthd + + psubusw m6, m4 ; bc + psubusw m4, m2 ; cb + + PABS_DIFF_WD m3, m6, m7 ; |ab - bc| + PABS_DIFF_WD m5, m4, m7 ; |ba - cb| + paddd 
m_sum, m3 + paddd m_sum, m5 + jl .loop_16bit + + mova m2, m_sum +%if mmsize == 16 + psrldq m2, 4 + paddd m_sum, m2 + psrldq m2, 4 + paddd m_sum, m2 + psrldq m2, 4 + paddd m_sum, m2 +%else + psrlq m2, 32 + paddd m_sum, m2 +%endif + movd eax, m_sum + RET +%endmacro + +INIT_XMM sse2 +IDET_FILTER_LINE_16BIT 8 +%if ARCH_X86_32 +INIT_MMX mmx +IDET_FILTER_LINE_16BIT 4 +%endif + +;****************************************************************************** ; SSE2 8-bit implementation that does 16-bytes at a time: + INIT_XMM sse2 cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total xor indexq, indexq diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c index fb9ad832b0..1147ca8ba8 100644 --- a/libavfilter/x86/vf_idet_init.c +++ b/libavfilter/x86/vf_idet_init.c @@ -23,6 +23,8 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/vf_idet.h" +#if HAVE_YASM + /* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ #define FUNC_MAIN_DECL(KIND, SPAN) \ int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ @@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ return sum; \ } -#if HAVE_YASM + +#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \ +int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ + const uint16_t *c, int w); \ +static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ + const uint16_t *c, int w) { \ + int sum = 0; \ + const int left_over = w & (SPAN - 1); \ + w -= left_over; \ + if (w > 0) \ + sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \ + if (left_over > 0) \ + sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \ + return sum; \ +} FUNC_MAIN_DECL(sse2, 16) +FUNC_MAIN_DECL_16bit(sse2, 8) #if ARCH_X86_32 FUNC_MAIN_DECL(mmx, 8) FUNC_MAIN_DECL(mmxext, 8) +FUNC_MAIN_DECL_16bit(mmx, 4) #endif #endif - -av_cold void ff_idet_init_x86(IDETContext *idet) +av_cold void ff_idet_init_x86(IDETContext *idet, int 
for_16b) { #if HAVE_YASM const int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 if (EXTERNAL_MMX(cpu_flags)) { - idet->filter_line = idet_filter_line_mmx; + idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx; } if (EXTERNAL_MMXEXT(cpu_flags)) { - idet->filter_line = idet_filter_line_mmxext; + idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext; } #endif // ARCH_x86_32 if (EXTERNAL_SSE2(cpu_flags)) { - idet->filter_line = idet_filter_line_sse2; + idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2; } #endif // HAVE_YASM }