From 74f8d9aaef9150ead953e156e1c366d2a933ec1c Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Tue, 9 Feb 2016 02:20:56 +0000 Subject: [PATCH] x86/vf_blend: Add SSE2 optimization for screen 10x faster than C. Reviewed-by: Paul B Mahol --- libavfilter/x86/vf_blend.asm | 29 +++++++++++++++++++++++++++++ libavfilter/x86/vf_blend_init.c | 2 ++ 2 files changed, 31 insertions(+) diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 50b5f8a197..a5ea74c5bc 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -111,6 +111,13 @@ BLEND_END psrlw %1, 8 ; 00xx00xx a * b / 255 %endmacro +%macro SCREEN 4 ; a, b, pw_1, pw_255 + pxor %1, %4 ; 00xx00xx 255 - a + pxor %2, %4 + MULTIPLY %1, %2, %3 + pxor %1, %4 ; 00xx00xx 255 - x / 255 +%endmacro + BLEND_INIT multiply, 4 pxor m2, m2 mova m3, [pw_1] @@ -134,6 +141,28 @@ BLEND_INIT multiply, 4 jl .loop BLEND_END +BLEND_INIT screen, 5 + pxor m2, m2 + mova m3, [pw_1] + mova m4, [pw_255] +.nextrow: + mov xq, widthq + + .loop: + movh m0, [topq + xq] ; 0000xxxx + movh m1, [bottomq + xq] + punpcklbw m0, m2 ; 00xx00xx + punpcklbw m1, m2 + + SCREEN m0, m1, m3, m4 + + packuswb m0, m0 ; 0000xxxx + movh [dstq + xq], m0 + add xq, mmsize / 2 + + jl .loop +BLEND_END + BLEND_INIT average, 3 pxor m2, m2 .nextrow: diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c index 8ac526aacd..a6baf94f42 100644 --- a/libavfilter/x86/vf_blend_init.c +++ b/libavfilter/x86/vf_blend_init.c @@ -37,6 +37,7 @@ BLEND_FUNC(and, sse2) BLEND_FUNC(darken, sse2) BLEND_FUNC(difference128, sse2) BLEND_FUNC(multiply, sse2) +BLEND_FUNC(screen, sse2) BLEND_FUNC(hardmix, sse2) BLEND_FUNC(lighten, sse2) BLEND_FUNC(or, sse2) @@ -65,6 +66,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) case BLEND_MULTIPLY: param->blend = ff_blend_multiply_sse2; break; case BLEND_OR: param->blend = ff_blend_or_sse2; break; case BLEND_PHOENIX: param->blend = ff_blend_phoenix_sse2; break; + case BLEND_SCREEN: param->blend = ff_blend_screen_sse2; break; case BLEND_SUBTRACT: param->blend = ff_blend_subtract_sse2; break; case BLEND_XOR: param->blend = ff_blend_xor_sse2; break; case BLEND_DIFFERENCE: param->blend = ff_blend_difference_sse2; break;