x86/takdsp: add avx2 versions of all functions

Benchmark results on an Intel Core i7-12700K (lower is faster):

decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8

Tested-by: Lynne <dev@lynne.ee>
Signed-off-by: James Almer <jamrial@gmail.com>
release/7.0
James Almer 11 months ago
parent 370ce305f4
commit 591dc3b4b8
  1. libavcodec/x86/takdsp.asm (41 lines changed)
  2. libavcodec/x86/takdsp_init.c (11 lines changed)

@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text SECTION .text
INIT_XMM sse2 %macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2 shl lengthd, 2
add p1q, lengthq add p1q, lengthq
@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq] mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize] mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize] mova m4, [p2q+lengthq+mmsize]
mova m2, m1 psrad m2, m1, 1
mova m5, m4 psrad m5, m4, 1
psrad m2, 1
psrad m5, 1
psubd m0, m2 psubd m0, m2
psubd m3, m5 psubd m3, m5
paddd m1, m0 paddd m1, m0
@ -88,29 +86,44 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2 add lengthq, mmsize*2
jl .loop jl .loop
RET RET
%endmacro
INIT_XMM sse4 INIT_XMM sse2
TAK_DECORRELATE
INIT_YMM avx2
TAK_DECORRELATE
%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2 shl lengthd, 2
add p1q, lengthq add p1q, lengthq
add p2q, lengthq add p2q, lengthq
neg lengthq neg lengthq
movd m2, dshiftm movd xm2, dshiftm
movd m3, dfactorm %if UNIX64
pshufd m3, m3, 0 movd xm3, dfactorm
mova m4, [pd_128] VPBROADCASTD m3, xm3
%else
VPBROADCASTD m3, dfactorm
%endif
VBROADCASTI128 m4, [pd_128]
.loop: .loop:
mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq] mova m1, [p2q+lengthq]
psrad m1, m2 psrad m1, xm2
pmulld m1, m3 pmulld m1, m3
paddd m1, m4 paddd m1, m4
psrad m1, 8 psrad m1, 8
pslld m1, m2 pslld m1, xm2
psubd m1, m0 psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1 mova [p1q+lengthq], m1
add lengthq, mmsize add lengthq, mmsize
jl .loop jl .loop
RET RET
%endmacro
INIT_XMM sse4
TAK_DECORRELATE_SF
INIT_YMM avx2
TAK_DECORRELATE_SF

@ -24,9 +24,13 @@
#include "config.h" #include "config.h"
void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length); void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_ls_avx2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length); void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sr_avx2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length); void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor); void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
void ff_tak_decorrelate_sf_avx2(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c) av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{ {
@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) { if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4; c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
} }
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
}
#endif #endif
} }

Loading…
Cancel
Save