diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c index 9304536291..af24e998ca 100644 --- a/libavcodec/jpeg2000.c +++ b/libavcodec/jpeg2000.c @@ -215,6 +215,7 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp, (comp->coord[1][1] - comp->coord[1][0]); if (codsty->transform == FF_DWT97) { + csize += FF_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->f_data); /* grow the element count by FF_INPUT_BUFFER_PADDING_SIZE bytes' worth of f_data entries; NOTE(review): presumably tail slack for the whole-vector x86 ict_float loop - confirm */ comp->i_data = NULL; comp->f_data = av_mallocz_array(csize, sizeof(*comp->f_data)); if (!comp->f_data) diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c index a7c7f53b7a..d183cbb87d 100644 --- a/libavcodec/jpeg2000dsp.c +++ b/libavcodec/jpeg2000dsp.c @@ -95,4 +95,7 @@ av_cold void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c) c->mct_decode[FF_DWT97] = ict_float; c->mct_decode[FF_DWT53] = rct_int; c->mct_decode[FF_DWT97_INT] = ict_int; + + if (ARCH_X86) + ff_jpeg2000dsp_init_x86(c); /* called after the C entries are set, so the x86 init can replace them */ } diff --git a/libavcodec/jpeg2000dsp.h b/libavcodec/jpeg2000dsp.h index de1ddb94cd..1ae5b95d9a 100644 --- a/libavcodec/jpeg2000dsp.h +++ b/libavcodec/jpeg2000dsp.h @@ -31,5 +31,6 @@ typedef struct Jpeg2000DSPContext { } Jpeg2000DSPContext; void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c); +void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c); #endif /* AVCODEC_JPEG2000DSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 87985f279b..08cee1c4fd 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,7 @@ OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o +OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o @@ -138,6 +139,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \ x86/hevc_idct.o \ x86/hevc_sao.o +YASM-OBJS-$(CONFIG_JPEG2000_DECODER) += 
x86/jpeg2000dsp.o YASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o YASM-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm new file mode 100644 index 0000000000..0d79ab7703 --- /dev/null +++ b/libavcodec/x86/jpeg2000dsp.asm @@ -0,0 +1,108 @@ +;****************************************************************************** +;* SIMD-optimized JPEG2000 DSP functions +;* Copyright (c) 2014 Nicolas Bertrand +;* Copyright (c) 2015 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402       ; each constant is replicated 8 times = one full 32-byte vector
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+; In place, from the old values: src0' = src0 + 1.402*src2,
+; src1' = src0 - 0.34413*src1 - 0.71414*src2, src2' = src0 + 1.772*src1.
+%macro ICT_FLOAT 1              ; %1 is forwarded to cglobal as its vector-register count
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+    shl      csized, 2          ; element count -> byte count (4 bytes per element)
+    add      src0q, csizeq      ; move all three pointers to the end of their data ...
+    add      src1q, csizeq
+    add      src2q, csizeq
+    neg      csizeq             ; ... and index them with a negative offset that counts up to 0
+    movaps   m6, [pf_ict0]
+    movaps   m7, [pf_ict1]
+    %define ICT0 m6
+    %define ICT1 m7
+
+%if ARCH_X86_64                 ; x86-64: every constant is kept in a register (m8/m9 available)
+    movaps   m8, [pf_ict2]
+    %define ICT2 m8
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]      ; m3 is free here: the AVX loop body never uses it as a temporary
+    %define ICT3 m3
+%else
+    movaps   m9, [pf_ict3]
+    %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+    %define ICT2 [pf_ict2]      ; only m0-m7 are used on x86-32, so ICT2 is a memory operand
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]
+    %define ICT3 m3
+%else
+    %define ICT3 [pf_ict3]      ; SSE needs m3 as a temporary, so ICT3 stays in memory too
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+    movaps   m0, [src0q+csizeq] ; aligned load of one whole vector (mmsize bytes) per plane
+    movaps   m1, [src1q+csizeq]
+    movaps   m2, [src2q+csizeq]
+
+%if cpuflag(avx)
+    mulps    m5, m1, ICT1       ; m5 = 0.34413*src1
+    mulps    m4, m2, ICT0       ; m4 = 1.402*src2
+    mulps    m1, m1, ICT3       ; m1 = 1.772*src1
+    mulps    m2, m2, ICT2       ; m2 = 0.71414*src2
+    subps    m5, m0, m5         ; m5 = src0 - 0.34413*src1
+%else ; sse
+    movaps   m3, m1             ; copy the inputs first: the 2-operand forms overwrite their destination
+    movaps   m4, m2
+    movaps   m5, m0
+    mulps    m3, ICT1           ; m3 = 0.34413*src1
+    mulps    m4, ICT0           ; m4 = 1.402*src2
+    mulps    m1, ICT3           ; m1 = 1.772*src1
+    mulps    m2, ICT2           ; m2 = 0.71414*src2
+    subps    m5, m3             ; m5 = src0 - 0.34413*src1
+%endif
+    addps    m4, m4, m0         ; m4 = src0 + 1.402*src2
+    addps    m0, m0, m1         ; m0 = src0 + 1.772*src1
+    subps    m5, m5, m2         ; m5 = src0 - 0.34413*src1 - 0.71414*src2
+
+    movaps   [src0q+csizeq], m4
+    movaps   [src2q+csizeq], m0
+    movaps   [src1q+csizeq], m5
+    add      csizeq, mmsize     ; always a whole vector: the last pass may touch bytes past csize elements
+    jl .loop                    ; repeat while the offset is still negative
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10                    ; SSE build uses m0-m9 on x86-64
+INIT_YMM avx
+ICT_FLOAT 9                     ; AVX build uses m0-m8 on x86-64
diff --git a/libavcodec/x86/jpeg2000dsp_init.c
b/libavcodec/x86/jpeg2000dsp_init.c new file mode 100644 index 0000000000..43b9ccd6cb --- /dev/null +++ b/libavcodec/x86/jpeg2000dsp_init.c @@ -0,0 +1,40 @@ +/* + * SIMD optimized JPEG 2000 DSP functions + * Copyright (c) 2015 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/jpeg2000dsp.h" + +void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize); +void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize); + +av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_SSE(cpu_flags)) { + c->mct_decode[FF_DWT97] = ff_ict_float_sse; + } + + if (EXTERNAL_AVX_FAST(cpu_flags)) { + c->mct_decode[FF_DWT97] = ff_ict_float_avx; + } +}