mirror of https://github.com/FFmpeg/FFmpeg.git
Tested using a simple command (apply edge enhance): ./ffmpeg_g -i ~/Downloads/bbb_sunflower_1080p_30fps_normal.mp4 \ -vf convolution="0 0 0 -1 1 0 0 0 0:0 0 0 -1 1 0 0 0 0:0 0 0 -1 1 0 0 0 0:0 0 0 -1 1 0 0 0 0:5:1:1:1:0:128:128:128" \ -an -vframes 1000 -f null /dev/null The fps increased from 151 to 270 on my local machine. Signed-off-by: Ruiling Song <ruiling.song@intel.com> pull/344/head
parent
6c67c8ca9a
commit
98e419cbf5
5 changed files with 271 additions and 38 deletions
@ -0,0 +1,64 @@ |
||||
/*
|
||||
* Copyright (c) 2012-2013 Oka Motofumi (chikuzen.mo at gmail dot com) |
||||
* Copyright (c) 2015 Paul B Mahol |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
#ifndef AVFILTER_CONVOLUTION_H |
||||
#define AVFILTER_CONVOLUTION_H |
||||
#include "avfilter.h" |
||||
|
||||
/**
 * Shape of a convolution matrix.
 *
 * The enumerators number consecutively from 0, so MATRIX_NBMODES, being the
 * last one, equals the count of real modes.
 */
enum MatrixMode {
    MATRIX_SQUARE = 0,  /* square kernel; with 9 coefficients the x86 init
                         * treats it as the 3x3 case */
    MATRIX_ROW,         /* presumably a single-row kernel -- confirm against
                         * the filter implementation */
    MATRIX_COLUMN,      /* presumably a single-column kernel -- confirm against
                         * the filter implementation */
    MATRIX_NBMODES,     /* number of modes; not a mode itself */
};
||||
|
||||
typedef struct ConvolutionContext { |
||||
const AVClass *class; |
||||
|
||||
char *matrix_str[4]; |
||||
float rdiv[4]; |
||||
float bias[4]; |
||||
int mode[4]; |
||||
float scale; |
||||
float delta; |
||||
int planes; |
||||
|
||||
int size[4]; |
||||
int depth; |
||||
int max; |
||||
int bpc; |
||||
int nb_planes; |
||||
int nb_threads; |
||||
int planewidth[4]; |
||||
int planeheight[4]; |
||||
int matrix[4][49]; |
||||
int matrix_length[4]; |
||||
int copy[4]; |
||||
|
||||
void (*setup[4])(int radius, const uint8_t *c[], const uint8_t *src, int stride, |
||||
int x, int width, int y, int height, int bpc); |
||||
void (*filter[4])(uint8_t *dst, int width, |
||||
float rdiv, float bias, const int *const matrix, |
||||
const uint8_t *c[], int peak, int radius, |
||||
int dstride, int stride); |
||||
} ConvolutionContext; |
||||
|
||||
void ff_convolution_init_x86(ConvolutionContext *s); |
||||
#endif |
@ -0,0 +1,156 @@ |
||||
;***************************************************************************** |
||||
;* x86-optimized functions for convolution filter |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
SECTION_RODATA

; Single-precision 0.5. The filter adds it to the scaled sum immediately before
; the truncating float->int conversion (cvttps2dq).
half:   dd 0.5
||||
|
||||
SECTION .text |
||||
|
||||
; void filter_3x3_sse4(uint8_t *dst, int width, |
||||
; float rdiv, float bias, const int *const matrix, |
||||
; const uint8_t *c[], int peak, int radius, |
||||
; int dstride, int stride) |
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
; PROCESS_V idx
; One vector tap of the kernel, four pixels at a time:
;     m4[k] += matrix[idx] * c[idx][x + k]        for k = 0..3
; In:    matrixq, c<idx>q, xq (named registers of the enclosing function)
; Out:   m4 (four 32-bit partial sums)
; Clobb: m2, m3
;
; The bytes are widened with pmovzxbd. It belongs to the same SSE4.1 level as
; the pmulld below, so it adds no new CPU requirement, and it replaces the
; movss + punpcklbw + punpcklwd sequence that needed a zeroed helper register.
;-----------------------------------------------------------------------------
%macro PROCESS_V 1
    movss        m2, [matrixq + 4 * %1]     ; m2[0] = matrix[idx]
    VBROADCASTSS m2, m2                     ; replicate the coefficient to all lanes
    pmovzxbd     m3, [c%1q + xq]            ; 4 bytes -> 4 zero-extended dwords
    pmulld       m2, m3                     ; coefficient * pixel, 32-bit lanes
    paddd        m4, m2                     ; accumulate
%endmacro
||||
|
||||
;-----------------------------------------------------------------------------
; PROCESS_S idx
; One scalar tap of the kernel, a single pixel:
;     r += matrix[idx] * c[idx][x]
; In:    matrixq, c<idx>q, xq (named registers of the enclosing function)
; Out:   rd (32-bit running sum)
; Clobb: ptrd, flags
;
; ptr serves as scratch here, so the caller must be done with the value it
; originally carried.
;-----------------------------------------------------------------------------
%macro PROCESS_S 1
    movzx   ptrd, byte [c%1q + xq]              ; pixel, zero-extended
    imul    ptrd, dword [matrixq + 4 * %1]      ; * matrix[idx]
    add     rd, ptrd                            ; accumulate
%endmacro
||||
|
||||
;-----------------------------------------------------------------------------
; FILTER_3X3
; Emits filter_3x3 for the instruction set selected by the preceding INIT_*.
;
; For every x in [0, width):
;     sum    = matrix[0] * c[0][x] + ... + matrix[8] * c[8][x]
;     dst[x] = clip_uint8(trunc(sum * rdiv + bias + 0.5))
;
; Register roles once the prologue is done:
;     m0 = rdiv (all lanes)    m1 = bias (all lanes)    m5 = 0.5 (all lanes)
;     m4 = accumulator         m6 = zero                m2, m3 = scratch
;     c0..c8 = the nine pointers read from c[]          x = pixel index
;     r = width % 4 during the vector loop, then the scalar accumulator
;     ptr = c[] on entry, scratch for PROCESS_S afterwards
;
; peak, radius, dstride and stride from the C prototype are not read.
; A width <= 0 returns immediately without touching dst or c[].
;-----------------------------------------------------------------------------
%macro FILTER_3X3 0
%if UNIX64
; rdiv and bias travel in xmm0/xmm1 and take no integer slot, so matrix and
; ptr are the third and fourth register arguments.
cglobal filter_3x3, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif

%if WIN64
    ; Argument slots are positional on Win64: rdiv/bias come in xmm2/xmm3 and
    ; matrix/ptr on the stack. Rename the vector registers and reload the two
    ; pointers so the code below is the same as for UNIX64.
    SWAP m0, m2
    SWAP m1, m3
    mov  r2q, matrixmp
    mov  r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif
    movsxdifnidn widthq, widthd

    ; The scalar loop tests its condition only after storing a pixel, so an
    ; empty or negative width has to be rejected before anything is read.
    test widthq, widthq
    jle .end

    VBROADCASTSS m0, m0                 ; rdiv in every lane
    VBROADCASTSS m1, m1                 ; bias in every lane
    pxor  m6, m6
    movss m5, [half]
    VBROADCASTSS m5, m5                 ; 0.5 in every lane

    ; Unpack the pointer table; ptr is free for reuse afterwards.
    mov  c0q, [ptrq + 0*gprsize]
    mov  c1q, [ptrq + 1*gprsize]
    mov  c2q, [ptrq + 2*gprsize]
    mov  c3q, [ptrq + 3*gprsize]
    mov  c4q, [ptrq + 4*gprsize]
    mov  c5q, [ptrq + 5*gprsize]
    mov  c6q, [ptrq + 6*gprsize]
    mov  c7q, [ptrq + 7*gprsize]
    mov  c8q, [ptrq + 8*gprsize]

    xor  xq, xq
    cmp  widthq, mmsize/4
    jl .loop2                           ; fewer than one vector: scalar only

    ; Split width into a multiple of mmsize/4 (kept in width) and the rest (r).
    mov  rq, widthq
    and  rq, mmsize/4-1
    sub  widthq, rq

.loop1:                                 ; mmsize/4 pixels per iteration
    pxor m4, m4                         ; sum = 0

    PROCESS_V 0
    PROCESS_V 1
    PROCESS_V 2
    PROCESS_V 3
    PROCESS_V 4
    PROCESS_V 5
    PROCESS_V 6
    PROCESS_V 7
    PROCESS_V 8

    cvtdq2ps  m4, m4
    mulps     m4, m0                    ; sum *= rdiv
    addps     m4, m1                    ; sum += bias
    addps     m4, m5                    ; sum += 0.5
    cvttps2dq m4, m4                    ; truncate to integer
    packssdw  m4, m4                    ; 32 -> 16 bit, signed saturation
    packuswb  m4, m4                    ; 16 -> 8 bit, unsigned saturation
    movss     [dstq + xq], m4           ; store 4 result bytes

    add  xq, mmsize/4
    cmp  xq, widthq
    jl .loop1

    add  widthq, rq                     ; restore the full width
    cmp  xq, widthq
    jge .end                            ; no leftover pixels

.loop2:                                 ; one pixel per iteration
    ; reuse r to hold sum, init with zero
    xor  rd, rd

    PROCESS_S 0
    PROCESS_S 1
    PROCESS_S 2
    PROCESS_S 3
    PROCESS_S 4
    PROCESS_S 5
    PROCESS_S 6
    PROCESS_S 7
    PROCESS_S 8

    pxor      m4, m4                    ; clear the lanes the scalar ops leave alone
    cvtsi2ss  m4, rd
    mulss     m4, m0                    ; sum *= rdiv
    addss     m4, m1                    ; sum += bias
    addss     m4, m5                    ; sum += 0.5
    ; we don't have simple scalar instructions to convert
    ; from 32bit to 8bit with saturation, so here
    ; just use packed version SSE instructions for simplicity.
    cvttps2dq m4, m4                    ; trunc to integer
    packssdw  m4, m4
    packuswb  m4, m4
    movd      rd, m4
    mov       [dstq + xq], rb           ; store the low byte only

    add  xq, 1
    cmp  xq, widthq
    jl .loop2
.end:
    RET
%endmacro
||||
|
||||
; Instantiate the kernel as an SSE4 XMM function. FILTER_3X3 requests 15
; general-purpose registers from cglobal, hence the 64-bit-only guard.
%if ARCH_X86_64
    INIT_XMM sse4
    FILTER_3X3
%endif
@ -0,0 +1,46 @@ |
||||
/*
|
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
|
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/x86/cpu.h" |
||||
#include "libavfilter/convolution.h" |
||||
|
||||
void ff_filter_3x3_sse4(uint8_t *dst, int width, |
||||
float rdiv, float bias, const int *const matrix, |
||||
const uint8_t *c[], int peak, int radius, |
||||
int dstride, int stride); |
||||
|
||||
av_cold void ff_convolution_init_x86(ConvolutionContext *s) |
||||
{ |
||||
#if ARCH_X86_64 |
||||
int i; |
||||
int cpu_flags = av_get_cpu_flags(); |
||||
for (i = 0; i < 4; i++) { |
||||
if (s->mode[i] == MATRIX_SQUARE) { |
||||
if (s->matrix_length[i] == 9) { |
||||
if (EXTERNAL_SSE4(cpu_flags)) |
||||
s->filter[i] = ff_filter_3x3_sse4; |
||||
} |
||||
} |
||||
} |
||||
#endif |
||||
} |
Loading…
Reference in new issue