mirror of https://github.com/FFmpeg/FFmpeg.git
parent
77f359670f
commit
344d519040
9 changed files with 708 additions and 165 deletions
@ -0,0 +1,25 @@ |
||||
/*
|
||||
* VP9 SIMD optimizations |
||||
* |
||||
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#define BPC 10 |
||||
#define INIT_FUNC ff_vp9dsp_init_10bpp_x86 |
||||
#include "vp9dsp_init_16bpp_template.c" |
@ -0,0 +1,25 @@ |
||||
/*
|
||||
* VP9 SIMD optimizations |
||||
* |
||||
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#define BPC 12 |
||||
#define INIT_FUNC ff_vp9dsp_init_12bpp_x86 |
||||
#include "vp9dsp_init_16bpp_template.c" |
@ -0,0 +1,62 @@ |
||||
/*
|
||||
* VP9 SIMD optimizations |
||||
* |
||||
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/mem.h" |
||||
#include "libavutil/x86/cpu.h" |
||||
#include "libavcodec/vp9dsp.h" |
||||
#include "libavcodec/x86/vp9dsp_init.h" |
||||
|
||||
#if HAVE_YASM |
||||
|
||||
extern const int16_t ff_filters_16bpp[3][15][4][16]; |
||||
|
||||
decl_mc_funcs(4, sse2, int16_t, 16, BPC); |
||||
decl_mc_funcs(8, sse2, int16_t, 16, BPC); |
||||
|
||||
mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC); |
||||
mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC); |
||||
mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC); |
||||
|
||||
filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp) |
||||
filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp) |
||||
|
||||
filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp) |
||||
filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp) |
||||
|
||||
#endif /* HAVE_YASM */ |
||||
|
||||
av_cold void INIT_FUNC(VP9DSPContext *dsp) |
||||
{ |
||||
#if HAVE_YASM |
||||
int cpu_flags = av_get_cpu_flags(); |
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) { |
||||
init_subpel3(0, put, BPC, sse2); |
||||
init_subpel3(1, avg, BPC, sse2); |
||||
} |
||||
|
||||
#endif /* HAVE_YASM */ |
||||
|
||||
ff_vp9dsp_init_16bpp_x86(dsp); |
||||
} |
@ -0,0 +1,421 @@ |
||||
;****************************************************************************** |
||||
;* VP9 MC SIMD optimizations |
||||
;* |
||||
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
SECTION_RODATA 32 |
||||
|
||||
pw_4095: times 16 dw 0xfff |
||||
pd_64: times 8 dd 64 |
||||
|
||||
cextern pw_1023 |
||||
|
||||
SECTION .text |
||||
|
||||
%macro filter_h4_fn 1-2 12 |
||||
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery |
||||
mova m5, [pw_1023] |
||||
.body: |
||||
%if notcpuflag(sse4) && ARCH_X86_64 |
||||
pxor m11, m11 |
||||
%endif |
||||
mova m6, [pd_64] |
||||
mova m7, [filteryq+ 0] |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
mova m8, [filteryq+32] |
||||
mova m9, [filteryq+64] |
||||
mova m10, [filteryq+96] |
||||
%endif |
||||
.loop: |
||||
movh m0, [srcq-6] |
||||
movh m1, [srcq-4] |
||||
movh m2, [srcq-2] |
||||
movh m3, [srcq+0] |
||||
movh m4, [srcq+2] |
||||
punpcklwd m0, m1 |
||||
punpcklwd m2, m3 |
||||
pmaddwd m0, m7 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m2, m8 |
||||
%else |
||||
pmaddwd m2, [filteryq+32] |
||||
%endif |
||||
movu m1, [srcq+4] |
||||
movu m3, [srcq+6] |
||||
paddd m0, m2 |
||||
movu m2, [srcq+8] |
||||
add srcq, sstrideq |
||||
punpcklwd m4, m1 |
||||
punpcklwd m3, m2 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m4, m9 |
||||
pmaddwd m3, m10 |
||||
%else |
||||
pmaddwd m4, [filteryq+64] |
||||
pmaddwd m3, [filteryq+96] |
||||
%endif |
||||
paddd m0, m4 |
||||
paddd m0, m3 |
||||
paddd m0, m6 |
||||
psrad m0, 7 |
||||
%if cpuflag(sse4) |
||||
packusdw m0, m0 |
||||
%else |
||||
packssdw m0, m0 |
||||
%endif |
||||
%ifidn %1, avg |
||||
movh m1, [dstq] |
||||
%endif |
||||
pminsw m0, m5 |
||||
%if notcpuflag(sse4) |
||||
%if ARCH_X86_64 |
||||
pmaxsw m0, m11 |
||||
%else |
||||
pxor m2, m2 |
||||
pmaxsw m0, m2 |
||||
%endif |
||||
%endif |
||||
%ifidn %1, avg |
||||
pavgw m0, m1 |
||||
%endif |
||||
movh [dstq], m0 |
||||
add dstq, dstrideq |
||||
dec hd |
||||
jg .loop |
||||
RET |
||||
|
||||
cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery |
||||
mova m5, [pw_4095] |
||||
jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body |
||||
%endmacro |
||||
|
||||
INIT_XMM sse2 |
||||
filter_h4_fn put |
||||
filter_h4_fn avg |
||||
|
||||
%macro filter_h_fn 1-2 12 |
||||
%assign %%px mmsize/2 |
||||
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery |
||||
mova m5, [pw_1023] |
||||
.body: |
||||
%if notcpuflag(sse4) && ARCH_X86_64 |
||||
pxor m11, m11 |
||||
%endif |
||||
mova m6, [pd_64] |
||||
mova m7, [filteryq+ 0] |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
mova m8, [filteryq+32] |
||||
mova m9, [filteryq+64] |
||||
mova m10, [filteryq+96] |
||||
%endif |
||||
.loop: |
||||
movu m0, [srcq-6] |
||||
movu m1, [srcq-4] |
||||
movu m2, [srcq-2] |
||||
movu m3, [srcq+0] |
||||
movu m4, [srcq+2] |
||||
pmaddwd m0, m7 |
||||
pmaddwd m1, m7 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m2, m8 |
||||
pmaddwd m3, m8 |
||||
pmaddwd m4, m9 |
||||
%else |
||||
pmaddwd m2, [filteryq+32] |
||||
pmaddwd m3, [filteryq+32] |
||||
pmaddwd m4, [filteryq+64] |
||||
%endif |
||||
paddd m0, m2 |
||||
paddd m1, m3 |
||||
paddd m0, m4 |
||||
movu m2, [srcq+4] |
||||
movu m3, [srcq+6] |
||||
movu m4, [srcq+8] |
||||
add srcq, sstrideq |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m2, m9 |
||||
pmaddwd m3, m10 |
||||
pmaddwd m4, m10 |
||||
%else |
||||
pmaddwd m2, [filteryq+64] |
||||
pmaddwd m3, [filteryq+96] |
||||
pmaddwd m4, [filteryq+96] |
||||
%endif |
||||
paddd m1, m2 |
||||
paddd m0, m3 |
||||
paddd m1, m4 |
||||
paddd m0, m6 |
||||
paddd m1, m6 |
||||
psrad m0, 7 |
||||
psrad m1, 7 |
||||
%if cpuflag(sse4) |
||||
packusdw m0, m0 |
||||
packusdw m1, m1 |
||||
%else |
||||
packssdw m0, m0 |
||||
packssdw m1, m1 |
||||
%endif |
||||
punpcklwd m0, m1 |
||||
pminsw m0, m5 |
||||
%if notcpuflag(sse4) |
||||
%if ARCH_X86_64 |
||||
pmaxsw m0, m11 |
||||
%else |
||||
pxor m2, m2 |
||||
pmaxsw m0, m2 |
||||
%endif |
||||
%endif |
||||
%ifidn %1, avg |
||||
pavgw m0, [dstq] |
||||
%endif |
||||
mova [dstq], m0 |
||||
add dstq, dstrideq |
||||
dec hd |
||||
jg .loop |
||||
RET |
||||
|
||||
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery |
||||
mova m5, [pw_4095] |
||||
jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body |
||||
%endmacro |
||||
|
||||
INIT_XMM sse2 |
||||
filter_h_fn put |
||||
filter_h_fn avg |
||||
|
||||
%macro filter_v4_fn 1-2 12 |
||||
%if ARCH_X86_64 |
||||
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 |
||||
%else |
||||
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 |
||||
mov filteryq, r5mp |
||||
%define hd r4mp |
||||
%endif |
||||
mova m5, [pw_1023] |
||||
.body: |
||||
%if notcpuflag(sse4) && ARCH_X86_64 |
||||
pxor m11, m11 |
||||
%endif |
||||
mova m6, [pd_64] |
||||
lea sstride3q, [sstrideq*3] |
||||
lea src4q, [srcq+sstrideq] |
||||
sub srcq, sstride3q |
||||
mova m7, [filteryq+ 0] |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
mova m8, [filteryq+ 32] |
||||
mova m9, [filteryq+ 64] |
||||
mova m10, [filteryq+ 96] |
||||
%endif |
||||
.loop: |
||||
; FIXME maybe reuse loads from previous rows, or just |
||||
; more generally unroll this to prevent multiple loads of |
||||
; the same data? |
||||
movh m0, [srcq] |
||||
movh m1, [srcq+sstrideq] |
||||
movh m2, [srcq+sstrideq*2] |
||||
movh m3, [srcq+sstride3q] |
||||
add srcq, sstrideq |
||||
movh m4, [src4q] |
||||
punpcklwd m0, m1 |
||||
punpcklwd m2, m3 |
||||
pmaddwd m0, m7 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m2, m8 |
||||
%else |
||||
pmaddwd m2, [filteryq+ 32] |
||||
%endif |
||||
movh m1, [src4q+sstrideq] |
||||
movh m3, [src4q+sstrideq*2] |
||||
paddd m0, m2 |
||||
movh m2, [src4q+sstride3q] |
||||
add src4q, sstrideq |
||||
punpcklwd m4, m1 |
||||
punpcklwd m3, m2 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m4, m9 |
||||
pmaddwd m3, m10 |
||||
%else |
||||
pmaddwd m4, [filteryq+ 64] |
||||
pmaddwd m3, [filteryq+ 96] |
||||
%endif |
||||
paddd m0, m4 |
||||
paddd m0, m3 |
||||
paddd m0, m6 |
||||
psrad m0, 7 |
||||
%if cpuflag(sse4) |
||||
packusdw m0, m0 |
||||
%else |
||||
packssdw m0, m0 |
||||
%endif |
||||
%ifidn %1, avg |
||||
movh m1, [dstq] |
||||
%endif |
||||
pminsw m0, m5 |
||||
%if notcpuflag(sse4) |
||||
%if ARCH_X86_64 |
||||
pmaxsw m0, m11 |
||||
%else |
||||
pxor m2, m2 |
||||
pmaxsw m0, m2 |
||||
%endif |
||||
%endif |
||||
%ifidn %1, avg |
||||
pavgw m0, m1 |
||||
%endif |
||||
movh [dstq], m0 |
||||
add dstq, dstrideq |
||||
dec hd |
||||
jg .loop |
||||
RET |
||||
|
||||
%if ARCH_X86_64 |
||||
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 |
||||
%else |
||||
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 |
||||
mov filteryq, r5mp |
||||
%endif |
||||
mova m5, [pw_4095] |
||||
jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body |
||||
%endmacro |
||||
|
||||
INIT_XMM sse2 |
||||
filter_v4_fn put |
||||
filter_v4_fn avg |
||||
|
||||
%macro filter_v_fn 1-2 13 |
||||
%assign %%px mmsize/2 |
||||
%if ARCH_X86_64 |
||||
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 |
||||
%else |
||||
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 |
||||
mov filteryq, r5mp |
||||
%define hd r4mp |
||||
%endif |
||||
mova m5, [pw_1023] |
||||
.body: |
||||
%if notcpuflag(sse4) && ARCH_X86_64 |
||||
pxor m12, m12 |
||||
%endif |
||||
%if ARCH_X86_64 |
||||
mova m11, [pd_64] |
||||
%endif |
||||
lea sstride3q, [sstrideq*3] |
||||
lea src4q, [srcq+sstrideq] |
||||
sub srcq, sstride3q |
||||
mova m7, [filteryq+ 0] |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
mova m8, [filteryq+ 32] |
||||
mova m9, [filteryq+ 64] |
||||
mova m10, [filteryq+ 96] |
||||
%endif |
||||
.loop: |
||||
; FIXME maybe reuse loads from previous rows, or just |
||||
; more generally unroll this to prevent multiple loads of |
||||
; the same data? |
||||
movu m0, [srcq] |
||||
movu m1, [srcq+sstrideq] |
||||
movu m2, [srcq+sstrideq*2] |
||||
movu m3, [srcq+sstride3q] |
||||
add srcq, sstrideq |
||||
movu m4, [src4q] |
||||
SBUTTERFLY wd, 0, 1, 6 |
||||
SBUTTERFLY wd, 2, 3, 6 |
||||
pmaddwd m0, m7 |
||||
pmaddwd m1, m7 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m2, m8 |
||||
pmaddwd m3, m8 |
||||
%else |
||||
pmaddwd m2, [filteryq+ 32] |
||||
pmaddwd m3, [filteryq+ 32] |
||||
%endif |
||||
paddd m0, m2 |
||||
paddd m1, m3 |
||||
movu m2, [src4q+sstrideq] |
||||
movu m3, [src4q+sstrideq*2] |
||||
SBUTTERFLY wd, 4, 2, 6 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m4, m9 |
||||
pmaddwd m2, m9 |
||||
%else |
||||
pmaddwd m4, [filteryq+ 64] |
||||
pmaddwd m2, [filteryq+ 64] |
||||
%endif |
||||
paddd m0, m4 |
||||
paddd m1, m2 |
||||
movu m4, [src4q+sstride3q] |
||||
add src4q, sstrideq |
||||
SBUTTERFLY wd, 3, 4, 6 |
||||
%if ARCH_X86_64 && mmsize > 8 |
||||
pmaddwd m3, m10 |
||||
pmaddwd m4, m10 |
||||
%else |
||||
pmaddwd m3, [filteryq+ 96] |
||||
pmaddwd m4, [filteryq+ 96] |
||||
%endif |
||||
paddd m0, m3 |
||||
paddd m1, m4 |
||||
%if ARCH_X86_64 |
||||
paddd m0, m11 |
||||
paddd m1, m11 |
||||
%else |
||||
paddd m0, [pd_64] |
||||
paddd m1, [pd_64] |
||||
%endif |
||||
psrad m0, 7 |
||||
psrad m1, 7 |
||||
%if cpuflag(sse4) |
||||
packusdw m0, m1 |
||||
%else |
||||
packssdw m0, m1 |
||||
%endif |
||||
pminsw m0, m5 |
||||
%if notcpuflag(sse4) |
||||
%if ARCH_X86_64 |
||||
pmaxsw m0, m12 |
||||
%else |
||||
pxor m2, m2 |
||||
pmaxsw m0, m2 |
||||
%endif |
||||
%endif |
||||
%ifidn %1, avg |
||||
pavgw m0, [dstq] |
||||
%endif |
||||
mova [dstq], m0 |
||||
add dstq, dstrideq |
||||
dec hd |
||||
jg .loop |
||||
RET |
||||
|
||||
%if ARCH_X86_64 |
||||
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 |
||||
%else |
||||
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 |
||||
mov filteryq, r5mp |
||||
%endif |
||||
mova m5, [pw_4095] |
||||
jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body |
||||
%endmacro |
||||
|
||||
INIT_XMM sse2 |
||||
filter_v_fn put |
||||
filter_v_fn avg |
Loading…
Reference in new issue