mirror of https://github.com/FFmpeg/FFmpeg.git
Overall speed changes for 1920x1080, yuv422p10le, 60fps: from 0.19x to 0.343x
pull/350/head
parent
b9ea493afe
commit
389cc142fb
8 changed files with 985 additions and 274 deletions
@ -0,0 +1,118 @@ |
||||
/*
|
||||
* Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/common.h" |
||||
#include "libavutil/avassert.h" |
||||
|
||||
#include "cfhddsp.h" |
||||
|
||||
static av_always_inline void filter(int16_t *output, ptrdiff_t out_stride, |
||||
const int16_t *low, ptrdiff_t low_stride, |
||||
const int16_t *high, ptrdiff_t high_stride, |
||||
int len, int clip) |
||||
{ |
||||
int16_t tmp; |
||||
int i; |
||||
|
||||
tmp = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3; |
||||
output[(2*0+0)*out_stride] = (tmp + high[0*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*0+0)*out_stride] = av_clip_uintp2_c(output[(2*0+0)*out_stride], clip); |
||||
|
||||
tmp = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3; |
||||
output[(2*0+1)*out_stride] = (tmp - high[0*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*0+1)*out_stride] = av_clip_uintp2_c(output[(2*0+1)*out_stride], clip); |
||||
|
||||
for (i = 1; i < len - 1; i++) { |
||||
tmp = (low[(i-1)*low_stride] - low[(i+1)*low_stride] + 4) >> 3; |
||||
output[(2*i+0)*out_stride] = (tmp + low[i*low_stride] + high[i*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip); |
||||
|
||||
tmp = (low[(i+1)*low_stride] - low[(i-1)*low_stride] + 4) >> 3; |
||||
output[(2*i+1)*out_stride] = (tmp + low[i*low_stride] - high[i*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip); |
||||
} |
||||
|
||||
tmp = ( 5*low[i*low_stride] + 4*low[(i-1)*low_stride] - low[(i-2)*low_stride] + 4) >> 3; |
||||
output[(2*i+0)*out_stride] = (tmp + high[i*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip); |
||||
|
||||
tmp = (11*low[i*low_stride] - 4*low[(i-1)*low_stride] + low[(i-2)*low_stride] + 4) >> 3; |
||||
output[(2*i+1)*out_stride] = (tmp - high[i*high_stride]) >> 1; |
||||
if (clip) |
||||
output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip); |
||||
} |
||||
|
||||
/**
 * Column-wise reconstruction: run filter() down every one of @p width
 * columns, each column holding @p height low/high coefficient pairs.
 * Strides are in int16_t elements; no clipping is applied.
 */
static void vert_filter(int16_t *output, ptrdiff_t out_stride,
                        const int16_t *low, ptrdiff_t low_stride,
                        const int16_t *high, ptrdiff_t high_stride,
                        int width, int height)
{
    int col;

    /* Neighbouring columns are one element apart in all three buffers. */
    for (col = 0; col < width; col++)
        filter(output + col, out_stride,
               low    + col, low_stride,
               high   + col, high_stride,
               height, 0);
}
||||
|
||||
/**
 * Row-wise reconstruction: run filter() with unit strides over @p height
 * rows of @p width low/high coefficient pairs. No clipping is applied.
 *
 * After each row the inputs advance by their own stride, while the output
 * advances by twice @p ostride (all strides in int16_t elements).
 */
static void horiz_filter(int16_t *output, ptrdiff_t ostride,
                         const int16_t *low, ptrdiff_t lstride,
                         const int16_t *high, ptrdiff_t hstride,
                         int width, int height)
{
    const ptrdiff_t out_step = ostride * 2;
    int rows_left = height;

    while (rows_left-- > 0) {
        filter(output, 1, low, 1, high, 1, width, 0);
        output += out_step;
        low    += lstride;
        high   += hstride;
    }
}
||||
|
||||
/**
 * Reconstruct a single row of @p width coefficient pairs into contiguous
 * output samples, forwarding @p clip to filter() as the clipping bit count.
 */
static void horiz_filter_clip(int16_t *output, const int16_t *low, const int16_t *high,
                              int width, int clip)
{
    const ptrdiff_t contiguous = 1; /* output, low and high are all densely packed */

    filter(output, contiguous, low, contiguous, high, contiguous, width, clip);
}
||||
|
||||
/**
 * Same as horiz_filter_clip(), except that output samples are written to
 * every second element (output stride 2) while the inputs stay contiguous.
 */
static void horiz_filter_clip_bayer(int16_t *output, const int16_t *low, const int16_t *high,
                                    int width, int clip)
{
    const ptrdiff_t interleaved = 2; /* skip one element between output samples */
    const ptrdiff_t contiguous  = 1;

    filter(output, interleaved, low, contiguous, high, contiguous, width, clip);
}
||||
|
||||
/**
 * Fill @p c with the portable C filter implementations, then let the x86
 * initialiser replace them when building for x86.
 *
 * @param c     context whose function pointers are set
 * @param depth passed through unchanged to the x86 initialiser
 * @param bayer non-zero selects the clipping row filter that writes with
 *              an output stride of 2
 */
av_cold void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer)
{
    c->vert_filter       = vert_filter;
    c->horiz_filter      = horiz_filter;
    c->horiz_filter_clip = bayer ? horiz_filter_clip_bayer
                                 : horiz_filter_clip;

    if (ARCH_X86)
        ff_cfhddsp_init_x86(c, depth, bayer);
}
@ -0,0 +1,44 @@ |
||||
/*
|
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_CFHDDSP_H |
||||
#define AVCODEC_CFHDDSP_H |
||||
|
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
|
||||
/**
 * Table of CFHD reconstruction filter entry points.
 *
 * All strides are counted in int16_t elements. Every filter turns N
 * low/high coefficient pairs into 2 * N output samples along the
 * filtered direction.
 */
typedef struct CFHDDSPContext {
    /**
     * Filter @p height rows of @p width coefficient pairs, without clipping.
     * In the C implementation the output advances by 2 * out_stride per row,
     * the inputs by low_stride / high_stride.
     */
    void (*horiz_filter)(int16_t *output,     ptrdiff_t out_stride,
                         const int16_t *low,  ptrdiff_t low_stride,
                         const int16_t *high, ptrdiff_t high_stride,
                         int width, int height);

    /**
     * Filter @p width columns of @p height coefficient pairs, without
     * clipping. The strides give the distance between vertically adjacent
     * elements of each buffer.
     */
    void (*vert_filter)(int16_t *output,     ptrdiff_t out_stride,
                        const int16_t *low,  ptrdiff_t low_stride,
                        const int16_t *high, ptrdiff_t high_stride,
                        int width, int height);

    /**
     * Filter a single row of @p width coefficient pairs; when @p bpc is
     * non-zero the C implementation clips every output sample to an
     * unsigned @p bpc bit range.
     */
    void (*horiz_filter_clip)(int16_t *output,
                              const int16_t *low,
                              const int16_t *high,
                              int width, int bpc);
} CFHDDSPContext;
||||
|
||||
/**
 * Initialise @p c with the generic C filters and, on x86 builds, hand it to
 * ff_cfhddsp_init_x86() for possible replacement.
 *
 * @param c     context to fill
 * @param depth sample bit depth; the x86 initialiser tests it against 10 and 12
 * @param bayer non-zero selects the clipping row filter that writes every
 *              second output element
 */
void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer);

/**
 * Replace entries of @p c with x86 SIMD versions when the running CPU
 * supports them. The clipping row filter is only replaced for non-bayer
 * content with a depth of 10 or 12.
 *
 * @param c     context previously filled with the C implementations
 * @param depth sample bit depth (see above)
 * @param bayer non-zero keeps the C clipping row filter
 */
void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer);
||||
|
||||
#endif /* AVCODEC_CFHDDSP_H */ |
@ -0,0 +1,701 @@ |
||||
;****************************************************************************** |
||||
;* x86-optimized functions for the CFHD decoder |
||||
;* Copyright (c) 2020 Paul B Mahol |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
SECTION_RODATA

; Every table is exactly 16 bytes (one XMM register) so that the labels stay
; 16-byte aligned relative to each other; keep order and sizes unchanged.

; Word pairs used as pmaddwd multipliers on interleaved (a, b) inputs.
factor_p1_n1:  times 4 dw   1,  -1      ;  a - b
factor_n1_p1:  times 4 dw  -1,   1      ;  b - a
factor_p11_n4: times 4 dw  11,  -4      ; 11*a - 4*b
factor_p5_p4:  times 4 dw   5,   4      ;  5*a + 4*b

pd_4:          times 4 dd 4             ; rounding term added before >> 3
pw_1:          times 8 dw 1             ; pmaddwd multiplier for a + b
pw_0:          times 8 dw 0             ; lower clip bound
pw_1023:       times 8 dw 1023          ; upper clip bound, 10-bit
pw_4095:       times 8 dw 4095          ; upper clip bound, 12-bit
||||
|
||||
SECTION .text |
||||
|
||||
;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip_max
;
; Emits one row-filter function, selected by the macro argument:
;   0    -> cfhd_horiz_filter        (output, ostride, low, lwidth, high,
;                                     hwidth, width, height) - all rows, no clip
;   1023 -> cfhd_horiz_filter_clip10 (output, low, high, width, bpc) - one row
;   4095 -> cfhd_horiz_filter_clip12 (output, low, high, width, bpc) - one row
;
; For the clip variants every output word is clamped to [0, clip_max]; the
; bpc argument itself is not read.
;
; Per row: the first two and the last two output samples are computed with
; scalar code (one-sided edge filters), everything in between with SSE2,
; eight coefficient pairs (16 output samples) per iteration.
;
; NOTE(review): the vector loop always processes whole registers, so it reads
; and writes past the end of the row before the scalar tail overwrites the
; last two samples; callers presumably provide padded buffers - confirm.
;
; Strides and width arrive in int16_t elements and are doubled to byte
; offsets on entry.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, bpc
    DEFINE_ARGS output, low, high, width, x, temp
    shl         widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, bpc
    DEFINE_ARGS output, low, high, width, x, temp
    shl         widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
; Only 8 real arguments exist; the remaining 3 registers are scratch and must
; not be loaded from the stack.
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height
    DEFINE_ARGS output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl         ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl         widthd, 1

    ; height is a 32-bit int: the upper half of its argument slot is
    ; undefined, so load only the low dword (zero-extends) before negating.
    mov         yd, heightd
    neg         yq
%else
    ; x86-32: not enough registers, so the three (byte) strides are written
    ; back to their stack argument slots and x/y/temp reuse the registers.
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl         xd, 1
    shl         yd, 1
    shl         tempd, 1
    shl         widthd, 1

    mov         xmp, xq
    mov         ymp, yq
    mov         tempmp, tempq

    mov         yd, r7m
    neg         yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; keep the constants in registers where registers are available
    mova        m8, [factor_p1_n1]
    mova        m9, [factor_n1_p1]
    mova        m10, [pw_1]
    mova        m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; output[0] = (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
    movsx       xq, word [lowq]
    imul        xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx       xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx       xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd        xm0, tempd
    CLIPW       m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov         word [outputq], tempw

    ; output[1] = (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
    movsx       xq, word [lowq]
    imul        xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx       xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx       xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd        xm0, tempd
    CLIPW       m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov         word [outputq + 2], tempw

    mov         xq, 0

.loop:
    ; x is a byte offset; this iteration produces the outputs for
    ; coefficients i = x/2 + 1 .. x/2 + 8.
    movu        m4, [lowq + xq]             ; low[i-1]
    movu        m1, [lowq + xq + 4]         ; low[i+1]

    mova        m5, m4
    punpcklwd   m4, m1                      ; interleave (low[i-1], low[i+1])
    punpckhwd   m5, m1

    mova        m6, m4
    mova        m7, m5

%if ARCH_X86_64
    pmaddwd     m4, m8                      ; low[i-1] - low[i+1]
    pmaddwd     m5, m8
    pmaddwd     m6, m9                      ; low[i+1] - low[i-1]
    pmaddwd     m7, m9

    paddd       m4, m11
    paddd       m5, m11
    paddd       m6, m11
    paddd       m7, m11
%else
    pmaddwd     m4, [factor_p1_n1]
    pmaddwd     m5, [factor_p1_n1]
    pmaddwd     m6, [factor_n1_p1]
    pmaddwd     m7, [factor_n1_p1]

    paddd       m4, [pd_4]
    paddd       m5, [pd_4]
    paddd       m6, [pd_4]
    paddd       m7, [pd_4]
%endif

    psrad       m4, 3
    psrad       m5, 3
    psrad       m6, 3
    psrad       m7, 3

    movu        m2, [lowq + xq + 2]         ; low[i]
    movu        m3, [highq + xq + 2]        ; high[i]

    mova        m0, m2
    punpcklwd   m2, m3                      ; interleave (low[i], high[i])
    punpckhwd   m0, m3

    mova        m1, m2
    mova        m3, m0

%if ARCH_X86_64
    pmaddwd     m2, m10                     ; low[i] + high[i]
    pmaddwd     m0, m10
    pmaddwd     m1, m8                      ; low[i] - high[i]
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [pw_1]
    pmaddwd     m0, [pw_1]
    pmaddwd     m1, [factor_p1_n1]
    pmaddwd     m3, [factor_p1_n1]
%endif

    paddd       m2, m4
    paddd       m0, m5
    paddd       m1, m6
    paddd       m3, m7

    psrad       m2, 1
    psrad       m0, 1
    psrad       m1, 1
    psrad       m3, 1

    packssdw    m2, m0                      ; even output samples
    packssdw    m1, m3                      ; odd output samples

    mova        m0, m2
    punpcklwd   m2, m1                      ; re-interleave even/odd
    punpckhwd   m0, m1

%if %1
    CLIPW       m2, [pw_0], [pw_%1]
    CLIPW       m0, [pw_0], [pw_%1]
%endif

    movu        [outputq + xq * 2 + 4], m2
    movu        [outputq + xq * 2 + mmsize + 4], m0

    add         xq, mmsize
    cmp         xq, widthq
    jl          .loop

    ; Move all pointers to the end of the row so that the trailing edge can
    ; be addressed with negative offsets.
    add         lowq, widthq
    add         highq, widthq
    add         outputq, widthq
    add         outputq, widthq

    ; second-to-last output:
    ; (((5*low[n-1] + 4*low[n-2] - low[n-3] + 4) >> 3) + high[n-1]) >> 1
    movsx       xq, word [lowq - 2]
    imul        xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx       xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx       xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd        xm0, tempd
    CLIPW       m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov         word [outputq - 4], tempw

    ; last output:
    ; (((11*low[n-1] - 4*low[n-2] + low[n-3] + 4) >> 3) - high[n-1]) >> 1
    movsx       xq, word [lowq - 2]
    imul        xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx       xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx       xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd        xm0, tempd
    CLIPW       m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov         word [outputq - 2], tempw

%if %1 == 0
    ; rewind to the start of this row, then step to the next one
    ; (the output advances by two strides per input row)
    sub         lowq, widthq
    sub         highq, widthq
    sub         outputq, widthq
    sub         outputq, widthq

    add         lowq, lwidthq
    add         highq, hwidthq
    add         outputq, ostrideq
    add         outputq, ostrideq
    add         yq, 1                       ; y counts up from -height to 0
    jl          .looph
%endif

    RET
%endmacro

INIT_XMM sse2
CFHD_HORIZ_FILTER 0

INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

INIT_XMM sse2
CFHD_HORIZ_FILTER 4095
||||
|
||||
;------------------------------------------------------------------------------
; void ff_cfhd_vert_filter_sse2(int16_t *output, ptrdiff_t out_stride,
;                               const int16_t *low, ptrdiff_t low_stride,
;                               const int16_t *high, ptrdiff_t high_stride,
;                               int width, int height)
;
; Column filter: for each group of 8 columns (one XMM register of int16_t),
; walk down the rows and produce 2 * height output rows from height low/high
; rows. The first two and the last two output rows use one-sided edge
; filters, the rows in between use the symmetric interior filter.
;
; Strides and width arrive in int16_t elements and are doubled to byte
; offsets on entry. x is the byte offset of the current column group, y the
; current input row, pos a scratch byte offset.
;
; NOTE(review): columns are processed in whole registers, so up to 7 columns
; past `width` may be read and written; callers presumably pad - confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
; Only 8 real arguments exist; the remaining 3 registers are scratch.
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height
    DEFINE_ARGS output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl         ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl         widthd, 1

    ; height is a 32-bit int whose upper argument half is undefined: operate
    ; on the dword so that the 64-bit compare below sees a clean value.
    dec         heightd

    mova        m8, [factor_p1_n1]
    mova        m9, [factor_n1_p1]
    mova        m10, [pw_1]
    mova        m11, [pd_4]
    mova        m12, [factor_p11_n4]
    mova        m13, [factor_p5_p4]
%else
    ; x86-32: byte strides and (height - 1) live in the stack argument slots.
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl         xd, 1
    shl         yd, 1
    shl         posd, 1
    shl         widthd, 1

    mov         xmp, xq
    mov         ymp, yq
    mov         posmp, posq

    mov         xq, r7m
    dec         xq
    mov         widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor         xq, xq
.loopw:
    xor         yq, yq

    ; ---- output row 0 ----
    ; (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
    mov         posq, xq
    movu        m0, [lowq + posq]
    add         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

%if ARCH_X86_64
    pmaddwd     m0, m12
    pmaddwd     m2, m12
%else
    pmaddwd     m0, [factor_p11_n4]
    pmaddwd     m2, [factor_p11_n4]
%endif

    ; sign-extend low[2] to dwords: place words in the high halves, then
    ; shift right arithmetically by 16
    pxor        m4, m4
    add         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    paddd       m0, m4
    paddd       m2, m3

    paddd       m0, [pd_4]
    paddd       m2, [pd_4]

    psrad       m0, 3
    psrad       m2, 3

    mov         posq, xq
    pxor        m4, m4
    movu        m1, [highq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    paddd       m0, m4
    paddd       m2, m3

    psrad       m0, 1
    psrad       m2, 1

    packssdw    m0, m2

    movu        [outputq + posq], m0

    ; ---- output row 1 ----
    ; (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
    movu        m0, [lowq + posq]
    add         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

    ; ARCH_X86_64 is always defined (as 0 or 1), so it has to be tested with
    ; %if; %ifdef would select the m8-m13 path on x86-32 as well.
%if ARCH_X86_64
    pmaddwd     m0, m13
    pmaddwd     m2, m13
%else
    pmaddwd     m0, [factor_p5_p4]
    pmaddwd     m2, [factor_p5_p4]
%endif

    pxor        m4, m4
    add         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    psubd       m0, m4
    psubd       m2, m3

    paddd       m0, [pd_4]
    paddd       m2, [pd_4]

    psrad       m0, 3
    psrad       m2, 3

    mov         posq, xq
    pxor        m4, m4
    movu        m1, [highq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    psubd       m0, m4
    psubd       m2, m3

    psrad       m0, 1
    psrad       m2, 1

    packssdw    m0, m2

    add         posq, ostrideq
    movu        [outputq + posq], m0

    add         yq, 1
.looph:
    ; ---- interior input row y -> output rows 2y and 2y+1 ----
    ; pos = (y - 1) * low_stride + x
    mov         posq, lwidthq
    imul        posq, yq
    sub         posq, lwidthq
    add         posq, xq

    movu        m4, [lowq + posq]           ; low[y-1]

    add         posq, lwidthq
    add         posq, lwidthq
    movu        m1, [lowq + posq]           ; low[y+1]

    mova        m5, m4
    punpcklwd   m4, m1
    punpckhwd   m5, m1

    mova        m6, m4
    mova        m7, m5

%if ARCH_X86_64
    pmaddwd     m4, m8                      ; low[y-1] - low[y+1]
    pmaddwd     m5, m8
    pmaddwd     m6, m9                      ; low[y+1] - low[y-1]
    pmaddwd     m7, m9

    paddd       m4, m11
    paddd       m5, m11
    paddd       m6, m11
    paddd       m7, m11
%else
    pmaddwd     m4, [factor_p1_n1]
    pmaddwd     m5, [factor_p1_n1]
    pmaddwd     m6, [factor_n1_p1]
    pmaddwd     m7, [factor_n1_p1]

    paddd       m4, [pd_4]
    paddd       m5, [pd_4]
    paddd       m6, [pd_4]
    paddd       m7, [pd_4]
%endif

    psrad       m4, 3
    psrad       m5, 3
    psrad       m6, 3
    psrad       m7, 3

    sub         posq, lwidthq
    movu        m0, [lowq + posq]           ; low[y]

    mov         posq, hwidthq
    imul        posq, yq
    add         posq, xq
    movu        m1, [highq + posq]          ; high[y]

    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

    mova        m1, m0
    mova        m3, m2

%if ARCH_X86_64
    pmaddwd     m0, m10                     ; low[y] + high[y]
    pmaddwd     m2, m10
    pmaddwd     m1, m8                      ; low[y] - high[y]
    pmaddwd     m3, m8
%else
    pmaddwd     m0, [pw_1]
    pmaddwd     m2, [pw_1]
    pmaddwd     m1, [factor_p1_n1]
    pmaddwd     m3, [factor_p1_n1]
%endif

    paddd       m0, m4
    paddd       m2, m5
    paddd       m1, m6
    paddd       m3, m7

    psrad       m0, 1
    psrad       m2, 1
    psrad       m1, 1
    psrad       m3, 1

    packssdw    m0, m2
    packssdw    m1, m3

    ; pos = 2 * y * out_stride + x
    mov         posq, ostrideq
    imul        posq, 2
    imul        posq, yq
    add         posq, xq

    movu        [outputq + posq], m0
    add         posq, ostrideq
    movu        [outputq + posq], m1

    add         yq, 1
    cmp         yq, heightq                 ; heightq holds height - 1
    jl          .looph

    ; ---- second-to-last output row (y is now the last input row) ----
    ; (((5*low[y] + 4*low[y-1] - low[y-2] + 4) >> 3) + high[y]) >> 1
    mov         posq, lwidthq
    imul        posq, yq
    add         posq, xq
    movu        m0, [lowq + posq]
    sub         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

%if ARCH_X86_64
    pmaddwd     m0, m13
    pmaddwd     m2, m13
%else
    pmaddwd     m0, [factor_p5_p4]
    pmaddwd     m2, [factor_p5_p4]
%endif

    pxor        m4, m4
    sub         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    psubd       m0, m4
    psubd       m2, m3

%if ARCH_X86_64
    paddd       m0, m11
    paddd       m2, m11
%else
    paddd       m0, [pd_4]
    paddd       m2, [pd_4]
%endif

    psrad       m0, 3
    psrad       m2, 3

    mov         posq, hwidthq
    imul        posq, yq
    add         posq, xq
    pxor        m4, m4
    movu        m1, [highq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    paddd       m0, m4
    paddd       m2, m3

    psrad       m0, 1
    psrad       m2, 1

    packssdw    m0, m2

    mov         posq, ostrideq
    imul        posq, 2
    imul        posq, yq
    add         posq, xq
    movu        [outputq + posq], m0

    ; ---- last output row ----
    ; (((11*low[y] - 4*low[y-1] + low[y-2] + 4) >> 3) - high[y]) >> 1
    mov         posq, lwidthq
    imul        posq, yq
    add         posq, xq
    movu        m0, [lowq + posq]
    sub         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

%if ARCH_X86_64
    pmaddwd     m0, m12
    pmaddwd     m2, m12
%else
    pmaddwd     m0, [factor_p11_n4]
    pmaddwd     m2, [factor_p11_n4]
%endif

    pxor        m4, m4
    sub         posq, lwidthq
    movu        m1, [lowq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    paddd       m0, m4
    paddd       m2, m3

%if ARCH_X86_64
    paddd       m0, m11
    paddd       m2, m11
%else
    paddd       m0, [pd_4]
    paddd       m2, [pd_4]
%endif

    psrad       m0, 3
    psrad       m2, 3

    mov         posq, hwidthq
    imul        posq, yq
    add         posq, xq
    pxor        m4, m4
    movu        m1, [highq + posq]
    mova        m3, m4
    punpcklwd   m4, m1
    punpckhwd   m3, m1

    psrad       m4, 16
    psrad       m3, 16

    psubd       m0, m4
    psubd       m2, m3

    psrad       m0, 1
    psrad       m2, 1

    packssdw    m0, m2

    mov         posq, ostrideq
    imul        posq, 2
    imul        posq, yq
    add         posq, ostrideq
    add         posq, xq
    movu        [outputq + posq], m0

    ; next group of 8 columns
    add         xq, mmsize
    cmp         xq, widthq
    jl          .loopw
    RET
@ -0,0 +1,52 @@ |
||||
/*
|
||||
* Copyright (c) 2020 Paul B Mahol |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/x86/cpu.h" |
||||
#include "libavcodec/avcodec.h" |
||||
#include "libavcodec/cfhddsp.h" |
||||
|
||||
void ff_cfhd_horiz_filter_sse2(int16_t *output, ptrdiff_t out_stride, |
||||
const int16_t *low, ptrdiff_t low_stride, |
||||
const int16_t *high, ptrdiff_t high_stride, |
||||
int width, int height); |
||||
void ff_cfhd_vert_filter_sse2(int16_t *output, ptrdiff_t out_stride, |
||||
const int16_t *low, ptrdiff_t low_stride, |
||||
const int16_t *high, ptrdiff_t high_stride, |
||||
int width, int height); |
||||
void ff_cfhd_horiz_filter_clip10_sse2(int16_t *output, const int16_t *low, const int16_t *high, int width, int bpc); |
||||
void ff_cfhd_horiz_filter_clip12_sse2(int16_t *output, const int16_t *low, const int16_t *high, int width, int bpc); |
||||
|
||||
av_cold void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer) |
||||
{ |
||||
int cpu_flags = av_get_cpu_flags(); |
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) { |
||||
c->horiz_filter = ff_cfhd_horiz_filter_sse2; |
||||
c->vert_filter = ff_cfhd_vert_filter_sse2; |
||||
if (depth == 10 && !bayer) |
||||
c->horiz_filter_clip = ff_cfhd_horiz_filter_clip10_sse2; |
||||
if (depth == 12 && !bayer) |
||||
c->horiz_filter_clip = ff_cfhd_horiz_filter_clip12_sse2; |
||||
} |
||||
} |
Loading…
Reference in new issue