swscale/x86/range_convert: update sse2 and avx2 range_convert functions to new API

chrRangeFromJpeg8_1920_c:    2127.4 (1.00x)
chrRangeFromJpeg8_1920_sse2:  816.0 (2.61x)  813.5 (2.62x)
chrRangeFromJpeg8_1920_avx2:  408.9 (5.20x)  405.4 (5.25x)
chrRangeToJpeg8_1920_c:      3166.9 (1.00x)
chrRangeToJpeg8_1920_sse2:    815.0 (3.89x)  815.0 (3.89x)
chrRangeToJpeg8_1920_avx2:    404.5 (7.83x)  405.5 (7.81x)
lumRangeFromJpeg8_1920_c:    1263.0 (1.00x)
lumRangeFromJpeg8_1920_sse2:  411.0 (3.07x)  413.2 (3.06x)
lumRangeFromJpeg8_1920_avx2:  200.5 (6.30x)  201.9 (6.26x)
lumRangeToJpeg8_1920_c:      1886.8 (1.00x)
lumRangeToJpeg8_1920_sse2:    412.0 (4.58x)  408.9 (4.61x)
lumRangeToJpeg8_1920_avx2:    208.5 (9.05x)  205.7 (9.17x)
pull/391/head
Ramiro Polla 5 months ago
parent 384fe39623
commit be108ebcf4
  1. libswscale/x86/range_convert.asm (86 lines changed)
  2. libswscale/x86/swscale.c (17 lines changed)

@@ -20,39 +20,29 @@
%include "libavutil/x86/x86util.asm" %include "libavutil/x86/x86util.asm"
SECTION_RODATA
chr_to_mult: times 4 dw 4663, 0
chr_to_offset: times 4 dd -9289992
%define chr_to_shift 12
chr_from_mult: times 4 dw 1799, 0
chr_from_offset: times 4 dd 4081085
%define chr_from_shift 11
lum_to_mult: times 4 dw 19077, 0
lum_to_offset: times 4 dd -39057361
%define lum_to_shift 14
lum_from_mult: times 4 dw 14071, 0
lum_from_offset: times 4 dd 33561947
%define lum_from_shift 14
SECTION .text SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; lumConvertRange ; lumConvertRange
; ;
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width); ; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width); ; uint32_t coeff, int64_t offset);
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
; uint32_t coeff, int64_t offset);
; ;
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro LUMCONVERTRANGE 4 %macro LUMCONVERTRANGE 1
cglobal %1, 2, 2, 5, dst, width cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
shl widthd, 1 shl widthd, 1
VBROADCASTI128 m2, [%2] movd xm2, coeffd
VBROADCASTI128 m3, [%3] VBROADCASTSS m2, xm2
%if ARCH_X86_64
movq xm3, offsetq
%else
movq xm3, offsetm
%endif
VBROADCASTSS m3, xm3
pxor m4, m4 pxor m4, m4
add dstq, widthq add dstq, widthq
neg widthq neg widthq
@@ -64,8 +54,8 @@ cglobal %1, 2, 2, 5, dst, width
pmaddwd m1, m2 pmaddwd m1, m2
paddd m0, m3 paddd m0, m3
paddd m1, m3 paddd m1, m3
psrad m0, %4 psrad m0, 14
psrad m1, %4 psrad m1, 14
packssdw m0, m1 packssdw m0, m1
movu [dstq+widthq], m0 movu [dstq+widthq], m0
add widthq, mmsize add widthq, mmsize
@@ -76,16 +66,24 @@ cglobal %1, 2, 2, 5, dst, width
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; chrConvertRange ; chrConvertRange
; ;
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); ; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); ; uint32_t coeff, int64_t offset);
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
; uint32_t coeff, int64_t offset);
; ;
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro CHRCONVERTRANGE 4 %macro CHRCONVERTRANGE 1
cglobal %1, 3, 3, 7, dstU, dstV, width cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
shl widthd, 1 shl widthd, 1
VBROADCASTI128 m4, [%2] movd xm4, coeffd
VBROADCASTI128 m5, [%3] VBROADCASTSS m4, xm4
%if ARCH_X86_64
movq xm5, offsetq
%else
movq xm5, offsetm
%endif
VBROADCASTSS m5, xm5
pxor m6, m6 pxor m6, m6
add dstUq, widthq add dstUq, widthq
add dstVq, widthq add dstVq, widthq
@@ -105,10 +103,10 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
paddd m1, m5 paddd m1, m5
paddd m2, m5 paddd m2, m5
paddd m3, m5 paddd m3, m5
psrad m0, %4 psrad m0, 14
psrad m1, %4 psrad m1, 14
psrad m2, %4 psrad m2, 14
psrad m3, %4 psrad m3, 14
packssdw m0, m1 packssdw m0, m1
packssdw m2, m3 packssdw m2, m3
movu [dstUq+widthq], m0 movu [dstUq+widthq], m0
@@ -119,15 +117,15 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift LUMCONVERTRANGE To
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift CHRCONVERTRANGE To
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift LUMCONVERTRANGE From
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift CHRCONVERTRANGE From
%if HAVE_AVX2_EXTERNAL %if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 INIT_YMM avx2
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift LUMCONVERTRANGE To
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift CHRCONVERTRANGE To
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift LUMCONVERTRANGE From
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift CHRCONVERTRANGE From
%endif %endif

@@ -464,27 +464,26 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
} while (0) } while (0)
#define RANGE_CONVERT_FUNCS_DECL(opt) \ #define RANGE_CONVERT_FUNCS_DECL(opt) \
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \ void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ uint32_t coeff, int64_t offset); \
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \ void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ uint32_t coeff, int64_t offset); \
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
RANGE_CONVERT_FUNCS_DECL(sse2); RANGE_CONVERT_FUNCS_DECL(sse2);
RANGE_CONVERT_FUNCS_DECL(avx2); RANGE_CONVERT_FUNCS_DECL(avx2);
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c) av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
{ {
/* This code is currently disabled because of changes in the base
* implementation of these functions. This code should be enabled
* again once those changes are ported to this architecture. */
#if 0
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags)) { if (EXTERNAL_AVX2_FAST(cpu_flags)) {
RANGE_CONVERT_FUNCS(avx2); RANGE_CONVERT_FUNCS(avx2);
} else if (EXTERNAL_SSE2(cpu_flags)) { } else if (EXTERNAL_SSE2(cpu_flags)) {
RANGE_CONVERT_FUNCS(sse2); RANGE_CONVERT_FUNCS(sse2);
} }
#endif
} }
av_cold void ff_sws_init_swscale_x86(SwsInternal *c) av_cold void ff_sws_init_swscale_x86(SwsInternal *c)

Loading…
Cancel
Save