swscale/x86: add sse2 and avx2 {lum,chr}ConvertRange

chrRangeFromJpeg_8_c: 22.3
chrRangeFromJpeg_8_sse2: 13.3
chrRangeFromJpeg_8_avx2: 13.3
chrRangeFromJpeg_24_c: 72.8
chrRangeFromJpeg_24_sse2: 22.3
chrRangeFromJpeg_24_avx2: 17.5
chrRangeFromJpeg_128_c: 345.5
chrRangeFromJpeg_128_sse2: 106.0
chrRangeFromJpeg_128_avx2: 57.8
chrRangeFromJpeg_144_c: 380.5
chrRangeFromJpeg_144_sse2: 118.5
chrRangeFromJpeg_144_avx2: 62.3
chrRangeFromJpeg_256_c: 646.3
chrRangeFromJpeg_256_sse2: 218.8
chrRangeFromJpeg_256_avx2: 109.0
chrRangeFromJpeg_512_c: 1461.5
chrRangeFromJpeg_512_sse2: 426.5
chrRangeFromJpeg_512_avx2: 211.5
chrRangeToJpeg_8_c: 37.8
chrRangeToJpeg_8_sse2: 10.5
chrRangeToJpeg_8_avx2: 14.0
chrRangeToJpeg_24_c: 114.3
chrRangeToJpeg_24_sse2: 23.5
chrRangeToJpeg_24_avx2: 16.3
chrRangeToJpeg_128_c: 633.5
chrRangeToJpeg_128_sse2: 107.5
chrRangeToJpeg_128_avx2: 55.0
chrRangeToJpeg_144_c: 758.3
chrRangeToJpeg_144_sse2: 132.0
chrRangeToJpeg_144_avx2: 64.5
chrRangeToJpeg_256_c: 1345.0
chrRangeToJpeg_256_sse2: 218.0
chrRangeToJpeg_256_avx2: 105.3
chrRangeToJpeg_512_c: 2524.0
chrRangeToJpeg_512_sse2: 417.0
chrRangeToJpeg_512_avx2: 218.8
lumRangeFromJpeg_8_c: 11.8
lumRangeFromJpeg_8_sse2: 11.0
lumRangeFromJpeg_8_avx2: 10.3
lumRangeFromJpeg_24_c: 38.5
lumRangeFromJpeg_24_sse2: 15.5
lumRangeFromJpeg_24_avx2: 12.5
lumRangeFromJpeg_128_c: 232.3
lumRangeFromJpeg_128_sse2: 60.0
lumRangeFromJpeg_128_avx2: 26.8
lumRangeFromJpeg_144_c: 259.5
lumRangeFromJpeg_144_sse2: 65.3
lumRangeFromJpeg_144_avx2: 29.0
lumRangeFromJpeg_256_c: 464.5
lumRangeFromJpeg_256_sse2: 107.5
lumRangeFromJpeg_256_avx2: 54.0
lumRangeFromJpeg_512_c: 897.5
lumRangeFromJpeg_512_sse2: 224.5
lumRangeFromJpeg_512_avx2: 109.8
lumRangeToJpeg_8_c: 17.8
lumRangeToJpeg_8_sse2: 11.0
lumRangeToJpeg_8_avx2: 11.8
lumRangeToJpeg_24_c: 56.3
lumRangeToJpeg_24_sse2: 11.0
lumRangeToJpeg_24_avx2: 12.5
lumRangeToJpeg_128_c: 333.8
lumRangeToJpeg_128_sse2: 53.3
lumRangeToJpeg_128_avx2: 26.5
lumRangeToJpeg_144_c: 375.5
lumRangeToJpeg_144_sse2: 60.8
lumRangeToJpeg_144_avx2: 29.0
lumRangeToJpeg_256_c: 652.0
lumRangeToJpeg_256_sse2: 109.5
lumRangeToJpeg_256_avx2: 53.5
lumRangeToJpeg_512_c: 1284.3
lumRangeToJpeg_512_sse2: 218.0
lumRangeToJpeg_512_avx2: 108.3
release/7.1
Ramiro Polla 5 months ago
parent 874152033d
commit f6859cade3
  1. 1
      libswscale/swscale_internal.h
  2. 2
      libswscale/utils.c
  3. 1
      libswscale/x86/Makefile
  4. 134
      libswscale/x86/range_convert.asm
  5. 35
      libswscale/x86/swscale.c

@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);

@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
ff_sws_init_range_convert(c);
#if ARCH_LOONGARCH64
ff_sws_init_range_convert_loongarch(c);
#elif ARCH_X86
ff_sws_init_range_convert_x86(c);
#endif
}

@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
x86/scale_avx2.o \
x86/range_convert.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \
x86/yuv2yuvX.o \

@ -0,0 +1,134 @@
;******************************************************************************
;* Copyright (c) 2024 Ramiro Polla
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Fixed-point parameters for the four range conversions implemented below.
; Each conversion evaluates (sample * mult + offset) >> shift using 32-bit
; intermediates.
;
; Every *_mult table stores the 16-bit multiplier followed by a zero word,
; repeated 4 times (16 bytes). The code interleaves each sample with a zero
; word before pmaddwd, so the pairwise multiply-add yields
; sample * mult + 0 * 0, i.e. the plain signed 32-bit product.
; Every *_offset table stores the 32-bit offset repeated 4 times (16 bytes).
; The *_shift values are assemble-time constants used as immediate shift counts.
;
; Parameters used by chrRangeToJpeg:
chr_to_mult: times 4 dw 4663, 0
chr_to_offset: times 4 dd -9289992
%define chr_to_shift 12
; Parameters used by chrRangeFromJpeg:
chr_from_mult: times 4 dw 1799, 0
chr_from_offset: times 4 dd 4081085
%define chr_from_shift 11
; Parameters used by lumRangeToJpeg:
lum_to_mult: times 4 dw 19077, 0
lum_to_offset: times 4 dd -39057361
%define lum_to_shift 14
; Parameters used by lumRangeFromJpeg:
lum_from_mult: times 4 dw 14071, 0
lum_from_offset: times 4 dd 33561947
%define lum_from_shift 14
SECTION .text
; NOTE: there is no need to clamp the input when converting to jpeg range
; (like we do in the C code) because packssdw will saturate the output.
;-----------------------------------------------------------------------------
; lumConvertRange
;
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
;
;-----------------------------------------------------------------------------
; Emits one in-place luma range-conversion function.
; Macro arguments:
;   %1 = function name
;   %2 = multiplier table (16 bytes of {mult, 0} word pairs)
;   %3 = offset table (16 bytes of 32-bit offsets)
;   %4 = right-shift count
; Per sample the function computes (dst[i] * mult + offset) >> shift and
; stores the result back with signed saturation to 16 bits.
%macro LUMCONVERTRANGE 4
; 2 arguments, 2 general-purpose registers, 7 vector registers (m0-m6)
cglobal %1, 2, 2, 7, dst, width
; width is a count of int16_t samples; double it to get a byte length
shl widthd, 1
; m4 = multiplier pairs, m5 = offsets, replicated into every 128-bit lane
VBROADCASTI128 m4, [%2]
VBROADCASTI128 m5, [%3]
; m6 = 0, used as the zero word interleaved with each sample
pxor m6, m6
; point dst at the end of the buffer and count a negative byte index up to 0
add dstq, widthq
neg widthq
.loop:
; load mmsize bytes of samples (unaligned load)
movu m0, [dstq+widthq]
; widen: pair every 16-bit sample with a zero word (high half -> m1, low half -> m0)
punpckhwd m1, m0, m6
punpcklwd m0, m6
; sample * mult + 0 * 0 -> signed 32-bit product
pmaddwd m0, m4
pmaddwd m1, m4
; add the fixed-point offset
paddd m0, m5
paddd m1, m5
; arithmetic shift back down to sample precision
psrad m0, %4
psrad m1, %4
; narrow to int16 with signed saturation; low/high halves return to their
; original order (unpack and pack both operate per 128-bit lane)
packssdw m0, m1
movu [dstq+widthq], m0
; advance by one full vector; loop while the index is still negative
; NOTE(review): whole vectors are always read and written, so a byte length
; that is not a multiple of mmsize touches memory past width samples -
; presumably the caller's buffers are padded; confirm.
add widthq, mmsize
jl .loop
RET
%endmacro
;-----------------------------------------------------------------------------
; chrConvertRange
;
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
;
;-----------------------------------------------------------------------------
; Emits one in-place chroma range-conversion function that processes the U and
; V buffers together, applying the same parameters to both.
; Macro arguments:
;   %1 = function name
;   %2 = multiplier table (16 bytes of {mult, 0} word pairs)
;   %3 = offset table (16 bytes of 32-bit offsets)
;   %4 = right-shift count
; Per sample the function computes (x * mult + offset) >> shift and stores the
; result back with signed saturation to 16 bits.
%macro CHRCONVERTRANGE 4
; 3 arguments, 3 general-purpose registers, 7 vector registers (m0-m6)
cglobal %1, 3, 3, 7, dstU, dstV, width
; width is a count of int16_t samples; double it to get a byte length
shl widthd, 1
; m4 = multiplier pairs, m5 = offsets, replicated into every 128-bit lane
VBROADCASTI128 m4, [%2]
VBROADCASTI128 m5, [%3]
; m6 = 0, used as the zero word interleaved with each sample
pxor m6, m6
; point both buffers at their ends and count a negative byte index up to 0
add dstUq, widthq
add dstVq, widthq
neg widthq
.loop:
; load mmsize bytes from each plane (m0 = U, m2 = V)
movu m0, [dstUq+widthq]
movu m2, [dstVq+widthq]
; widen: pair every 16-bit sample with a zero word
; (U high half -> m1, V high half -> m3, low halves stay in m0 / m2)
punpckhwd m1, m0, m6
punpckhwd m3, m2, m6
punpcklwd m0, m6
punpcklwd m2, m6
; sample * mult + 0 * 0 -> signed 32-bit product
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
; add the fixed-point offset
paddd m0, m5
paddd m1, m5
paddd m2, m5
paddd m3, m5
; arithmetic shift back down to sample precision
psrad m0, %4
psrad m1, %4
psrad m2, %4
psrad m3, %4
; narrow to int16 with signed saturation, restoring the original sample order
packssdw m0, m1
packssdw m2, m3
movu [dstUq+widthq], m0
movu [dstVq+widthq], m2
; advance by one full vector; loop while the index is still negative
; NOTE(review): whole vectors are always read and written, so a byte length
; that is not a multiple of mmsize touches memory past width samples -
; presumably the caller's buffers are padded; confirm.
add widthq, mmsize
jl .loop
RET
%endmacro
; Instantiate the four conversions for each instruction set. Each invocation
; passes the function name plus its multiplier table, offset table and shift.
; SSE2 build: 128-bit registers, i.e. 16 bytes (8 samples) per plane per loop
; iteration.
INIT_XMM sse2
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
; AVX2 build: 256-bit registers, i.e. 32 bytes (16 samples) per plane per loop
; iteration.
INIT_YMM avx2
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift

@ -453,6 +453,39 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#endif
/*
 * Install the range-conversion routines carrying the suffix `opt` into the
 * context `c` that is in scope at the expansion site.
 *
 * Nothing is assigned when c->dstBpc is above 14. Otherwise a non-zero
 * c->srcRange selects the "FromJpeg" pair and a zero c->srcRange selects the
 * "ToJpeg" pair, for both the luma and the chroma callback.
 */
#define RANGE_CONVERT_FUNCS(opt) do {                                       \
    if (c->dstBpc <= 14) {                                                  \
        c->lumConvertRange = c->srcRange ? ff_lumRangeFromJpeg_ ##opt       \
                                         : ff_lumRangeToJpeg_ ##opt;        \
        c->chrConvertRange = c->srcRange ? ff_chrRangeFromJpeg_ ##opt       \
                                         : ff_chrRangeToJpeg_ ##opt;        \
    }                                                                       \
} while (0)
/*
 * Declare the prototypes of the four assembly range-conversion routines for
 * one instruction-set suffix `opt` (used below with sse2 and avx2).
 *
 * The luma routines take one sample buffer, the chroma routines take the U
 * and V buffers; all of them take the width as an int.
 *
 * The final line of the macro deliberately carries neither a line
 * continuation nor a semicolon: a trailing backslash would splice the next
 * source line into the macro body, and the semicolon is supplied by the
 * invocation.
 */
#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width)

RANGE_CONVERT_FUNCS_DECL(sse2);
RANGE_CONVERT_FUNCS_DECL(avx2);
/**
 * Select x86 SIMD range-conversion callbacks for a scaler context.
 *
 * Does nothing when the source and destination ranges are equal or when the
 * destination format is any RGB format. Otherwise the AVX2 routines are
 * installed if EXTERNAL_AVX2_FAST reports support, else the SSE2 routines if
 * EXTERNAL_SSE2 does; RANGE_CONVERT_FUNCS itself only assigns the callbacks
 * for destination bit depths up to 14.
 *
 * @param c scaler context whose lumConvertRange / chrConvertRange may be set
 */
av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
{
    int cpu_flags;

    /* No conversion is set up for equal ranges or RGB output. */
    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat))
        return;

    cpu_flags = av_get_cpu_flags();

    /* Prefer the wider AVX2 implementation when it is usable. */
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        RANGE_CONVERT_FUNCS(avx2);
        return;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        RANGE_CONVERT_FUNCS(sse2);
    }
}
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@ -820,4 +853,6 @@ switch(c->dstBpc){ \
}
#endif
ff_sws_init_range_convert_x86(c);
}

Loading…
Cancel
Save