diff --git a/libswresample/rematrix.c b/libswresample/rematrix.c index 18e89c96ba..b9c5a4cf50 100644 --- a/libswresample/rematrix.c +++ b/libswresample/rematrix.c @@ -380,7 +380,7 @@ int swri_rematrix(SwrContext *s, AudioData *out, AudioData *in, int len, int mus in_i= s->matrix_ch[out_i][1]; if(s->matrix[out_i][in_i]!=1.0){ if(s->mix_1_1_simd && len1) - s->mix_1_1_simd(out->ch[out_i] , in->ch[in_i] , s->native_matrix, in->ch_count*out_i + in_i, len1); + s->mix_1_1_simd(out->ch[out_i] , in->ch[in_i] , s->native_simd_matrix, in->ch_count*out_i + in_i, len1); if(len != len1) s->mix_1_1_f (out->ch[out_i]+off, in->ch[in_i]+off, s->native_matrix, in->ch_count*out_i + in_i, len-len1); }else if(mustcopy){ diff --git a/libswresample/x86/rematrix.asm b/libswresample/x86/rematrix.asm index e6f0b2fab6..c96ce49d9e 100644 --- a/libswresample/x86/rematrix.asm +++ b/libswresample/x86/rematrix.asm @@ -21,6 +21,12 @@ %include "libavutil/x86/x86inc.asm" %include "libavutil/x86/x86util.asm" + +SECTION_RODATA +align 32 +dw1: times 8 dd 1 +w1 : times 16 dw 1 + SECTION .text %macro MIX2_FLT 1 @@ -99,6 +105,63 @@ mix_1_1_float_u_int %+ SUFFIX REP_RET %endmacro +%macro MIX1_INT16 1 +cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len +%ifidn %1, a + test inq, mmsize-1 + jne mix_1_1_int16_u_int %+ SUFFIX + test outq, mmsize-1 + jne mix_1_1_int16_u_int %+ SUFFIX +%else +mix_1_1_int16_u_int %+ SUFFIX +%endif + movd m4, [coeffpq + 4*indexq] + SPLATW m5, m4 + psllq m4, 32 + psrlq m4, 48 + mova m0, [w1] + psllw m0, m4 + psrlw m0, 1 + punpcklwd m5, m0 + add lenq , lenq + add inq , lenq + add outq , lenq + neg lenq +.next: + mov%1 m0, [inq + lenq ] + mov%1 m2, [inq + lenq + mmsize] + mova m1, m0 + mova m3, m2 + punpcklwd m0, [w1] + punpckhwd m1, [w1] + punpcklwd m2, [w1] + punpckhwd m3, [w1] + pmaddwd m0, m5 + pmaddwd m1, m5 + pmaddwd m2, m5 + pmaddwd m3, m5 + psrad m0, m4 + psrad m1, m4 + psrad m2, m4 + psrad m3, m4 + packssdw m0, m1 + packssdw m2, m3 + mov%1 [outq + lenq ], m0 + 
mov%1 [outq + lenq + mmsize], m2 + add lenq, mmsize*2 + jl .next +%if mmsize == 8 + emms + RET +%else + REP_RET +%endif +%endmacro + +INIT_MMX mmx +MIX1_INT16 u +MIX1_INT16 a + INIT_XMM sse MIX2_FLT u MIX2_FLT a diff --git a/libswresample/x86/swresample_x86.c b/libswresample/x86/swresample_x86.c index 18c601f72e..ba0f1f131a 100644 --- a/libswresample/x86/swresample_x86.c +++ b/libswresample/x86/swresample_x86.c @@ -163,6 +163,21 @@ void swri_rematrix_init_x86(struct SwrContext *s){ s->mix_2_1_simd = NULL; if (s->midbuf.fmt == AV_SAMPLE_FMT_S16P){ + if(mm_flags & AV_CPU_FLAG_MMX) { + s->mix_1_1_simd = ff_mix_1_1_a_int16_mmx; + } + s->native_simd_matrix = av_mallocz(2 * num * sizeof(int16_t)); + for(i=0; i<nb_out; i++){ + int sh = 0; + for(j=0; j<nb_in; j++) + sh = FFMAX(sh, FFABS(((int*)s->native_matrix)[i * nb_in + j])); + sh = FFMAX(av_log2(sh) - 14, 0); + for(j=0; j<nb_in; j++) { + ((int16_t*)s->native_simd_matrix)[2*(i * nb_in + j)+1] = 15 - sh; + ((int16_t*)s->native_simd_matrix)[2*(i * nb_in + j)] = + ((((int*)s->native_matrix)[i * nb_in + j]) + (1<<sh>>1)) >> sh; + } + } } else if(s->midbuf.fmt == AV_SAMPLE_FMT_FLTP){ if(mm_flags & AV_CPU_FLAG_SSE) { s->mix_1_1_simd = ff_mix_1_1_a_float_sse;