diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm index ffda009c4e..27be2a4b31 100644 --- a/libswscale/x86/range_convert.asm +++ b/libswscale/x86/range_convert.asm @@ -20,39 +20,29 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -chr_to_mult: times 4 dw 4663, 0 -chr_to_offset: times 4 dd -9289992 -%define chr_to_shift 12 - -chr_from_mult: times 4 dw 1799, 0 -chr_from_offset: times 4 dd 4081085 -%define chr_from_shift 11 - -lum_to_mult: times 4 dw 19077, 0 -lum_to_offset: times 4 dd -39057361 -%define lum_to_shift 14 - -lum_from_mult: times 4 dw 14071, 0 -lum_from_offset: times 4 dd 33561947 -%define lum_from_shift 14 - SECTION .text ;----------------------------------------------------------------------------- ; lumConvertRange ; -; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width); -; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width); +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width, +; uint32_t coeff, int64_t offset); +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width, +; uint32_t coeff, int64_t offset); ; ;----------------------------------------------------------------------------- -%macro LUMCONVERTRANGE 4 -cglobal %1, 2, 2, 5, dst, width +%macro LUMCONVERTRANGE 1 +cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset shl widthd, 1 - VBROADCASTI128 m2, [%2] - VBROADCASTI128 m3, [%3] + movd xm2, coeffd + VBROADCASTSS m2, xm2 +%if ARCH_X86_64 + movq xm3, offsetq +%else + movq xm3, offsetm +%endif + VBROADCASTSS m3, xm3 pxor m4, m4 add dstq, widthq neg widthq @@ -64,8 +54,8 @@ cglobal %1, 2, 2, 5, dst, width pmaddwd m1, m2 paddd m0, m3 paddd m1, m3 - psrad m0, %4 - psrad m1, %4 + psrad m0, 14 + psrad m1, 14 packssdw m0, m1 movu [dstq+widthq], m0 add widthq, mmsize @@ -76,16 +66,24 @@ cglobal %1, 2, 2, 5, dst, width ;----------------------------------------------------------------------------- ; chrConvertRange ; -; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); -; void 
ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width, +; uint32_t coeff, int64_t offset); +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width, +; uint32_t coeff, int64_t offset); ; ;----------------------------------------------------------------------------- -%macro CHRCONVERTRANGE 4 -cglobal %1, 3, 3, 7, dstU, dstV, width +%macro CHRCONVERTRANGE 1 +cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset shl widthd, 1 - VBROADCASTI128 m4, [%2] - VBROADCASTI128 m5, [%3] + movd xm4, coeffd + VBROADCASTSS m4, xm4 +%if ARCH_X86_64 + movq xm5, offsetq +%else + movq xm5, offsetm +%endif + VBROADCASTSS m5, xm5 pxor m6, m6 add dstUq, widthq add dstVq, widthq @@ -105,10 +103,10 @@ cglobal %1, 3, 3, 7, dstU, dstV, width paddd m1, m5 paddd m2, m5 paddd m3, m5 - psrad m0, %4 - psrad m1, %4 - psrad m2, %4 - psrad m3, %4 + psrad m0, 14 + psrad m1, 14 + psrad m2, 14 + psrad m3, 14 packssdw m0, m1 packssdw m2, m3 movu [dstUq+widthq], m0 @@ -119,15 +117,15 @@ cglobal %1, 3, 3, 7, dstU, dstV, width %endmacro INIT_XMM sse2 -LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift -CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift -LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift -CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift +LUMCONVERTRANGE To +CHRCONVERTRANGE To +LUMCONVERTRANGE From +CHRCONVERTRANGE From %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift -CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift -LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift -CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift +LUMCONVERTRANGE To +CHRCONVERTRANGE To +LUMCONVERTRANGE From +CHRCONVERTRANGE From %endif diff --git a/libswscale/x86/swscale.c 
b/libswscale/x86/swscale.c index 2722c4bdc6..550ad99f3f 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -464,27 +464,26 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2); } while (0) #define RANGE_CONVERT_FUNCS_DECL(opt) \ -void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \ -void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ -void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \ -void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \ + uint32_t coeff, int64_t offset); \ +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \ + uint32_t coeff, int64_t offset); \ +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \ + uint32_t coeff, int64_t offset); \ +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \ + uint32_t coeff, int64_t offset); \ RANGE_CONVERT_FUNCS_DECL(sse2); RANGE_CONVERT_FUNCS_DECL(avx2); av_cold void ff_sws_init_range_convert_x86(SwsInternal *c) { - /* This code is currently disabled because of changes in the base - * implementation of these functions. This code should be enabled - * again once those changes are ported to this architecture. */ -#if 0 int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_AVX2_FAST(cpu_flags)) { RANGE_CONVERT_FUNCS(avx2); } else if (EXTERNAL_SSE2(cpu_flags)) { RANGE_CONVERT_FUNCS(sse2); } -#endif } av_cold void ff_sws_init_swscale_x86(SwsInternal *c)