diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 8af9839b47..ea61c3c770 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                               const uint8_t *src, int width, int height,
+                               int lumStride, int chromStride, int srcStride);
 #endif
 
 #define DEINTERLEAVE_BYTES(cpuext) \
@@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
     }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+    }
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
 #endif
     }
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index ca7a481255..6e4df17298 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
 pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
 pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
 
+%if HAVE_AVX512ICL_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\
+              4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\
+              8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\
+             12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
+
+; shuffle vector to combine odd elements from two vectors to extract Y
+shuf_perm2b: db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\
+             33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\
+             65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\
+             97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+%endif
+
 SECTION .text
 
 %macro RSHIFT_COPY 5
@@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
 ;              int lumStride, int chromStride, int srcStride)
 ;-----------------------------------------------------------------------------------------------
 %macro UYVY_TO_YUV422 0
+%if mmsize == 64
+; need two more registers to store shuffle vectors for AVX512ICL
+cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%else
 cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%endif
     pxor m0, m0
+%if mmsize == 64
+    vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
+    movu m8, [shuf_packus]
+    movu m9, [shuf_perm2b]
+%else
     pcmpeqw m1, m1
+%endif
     psrlw m1, 8
 
     movsxdifnidn wq, wd
@@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     and xq, mmsize * 2 - 1
     je .loop_simd
 
+%if mmsize == 64
+    shr xq, 1
+    mov tmpq, -1
+    shlx tmpq, tmpq, xq
+    not tmpq
+    kmovq k7, tmpq ; write mask for U/V
+    kmovd k1, tmpd ; write mask for 1st half of Y
+    kmovw k3, tmpd ; read mask for 1st vector
+    shr tmpq, 16
+    kmovw k4, tmpd ; read mask for 2nd vector
+    shr tmpq, 16
+    kmovd k2, tmpd ; write mask for 2nd half of Y
+    kmovw k5, tmpd ; read mask for 3rd vector
+    shr tmpd, 16
+    kmovw k6, tmpd ; read mask for 4th vector
+
+    vmovdqu32 m2{k3}{z}, [srcq + wtwoq             ]
+    vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize    ]
+    vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
+    vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
+
+    ; extract y part 1
+    mova m6, m9
+    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq]{k1}, m6
+
+    ; extract y part 2
+    mova m7, m9
+    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq + mmsize]{k2}, m7
+
+    ; extract uv
+    pand m2, m1 ; UxVx...
+    pand m3, m1 ; UxVx...
+    pand m4, m1 ; UxVx...
+    pand m5, m1 ; UxVx...
+    packuswb m2, m3 ; UVUV...
+    packuswb m4, m5 ; UVUV...
+
+    ; U
+    pand m6, m2, m1 ; UxUx...
+    pand m7, m4, m1 ; UxUx...
+    packuswb m6, m7 ; UUUU
+    vpermb m6, m8, m6
+    vmovdqu8 [udstq + whalfq]{k7}, m6
+
+    ; V
+    psrlw m2, 8 ; VxVx...
+    psrlw m4, 8 ; VxVx...
+    packuswb m2, m4 ; VVVV
+    vpermb m2, m8, m2
+    vmovdqu8 [vdstq + whalfq]{k7}, m2
+
+    lea wq, [ wq + 2 * xq]
+    lea wtwoq, [wtwoq + 4 * xq]
+    add whalfq, xq
+%else
 .loop_scalar:
     mov tmpb, [srcq + wtwoq + 0]
     mov [udstq + whalfq], tmpb
@@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     add whalfq, 1
     sub xq, 2
     jg .loop_scalar
+%endif
 
     ; check if simd loop is need
     cmp wq, 0
@@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     movu m5, [srcq + wtwoq + mmsize * 3]
 %endif
 
+%if mmsize == 64
+    ; extract y part 1
+    mova m6, m9
+    vpermi2b m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq], m6
+
+    ; extract y part 2
+    mova m7, m9
+    vpermi2b m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq + mmsize], m7
+%else
     ; extract y part 1
     RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
     pand m6, m1; YxYx YxYx...
@@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
 
     packuswb m6, m7 ; YYYY YYYY...
     movu [ydstq + wq + mmsize], m6
+%endif
 
     ; extract uv
     pand m2, m1 ; UxVx...
@@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     pand m7, m4, m1 ; UxUx...
     packuswb m6, m7 ; UUUU
+%if mmsize == 64
+    vpermb m6, m8, m6
+%endif
 
     movu [udstq + whalfq], m6
 
@@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     psrlw m2, 8 ; VxVx...
     psrlw m4, 8 ; VxVx...
     packuswb m2, m4 ; VVVV
+%if mmsize == 64
+    vpermb m2, m8, m2
+%endif
     movu [vdstq + whalfq], m2
 
     add whalfq, mmsize
@@ -303,4 +404,8 @@ UYVY_TO_YUV422
 INIT_YMM avx2
 UYVY_TO_YUV422
 %endif
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+UYVY_TO_YUV422
+%endif
 %endif
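
For reference, the kernel strategy above can be sketched in C intrinsics. The sketch below is illustrative only, not part of the patch: the function name and the computed index tables are hypothetical stand-ins for shuf_perm2b and shuf_packus, and the k-register masked tail handling is omitted. It mirrors one full-width iteration: 256 source bytes (128 UYVY pixels) in, 128 Y plus 64 U plus 64 V bytes out. Building it requires AVX512BW and AVX512VBMI (e.g. -march=icelake-client).

#include <immintrin.h>
#include <stdint.h>

/* One full-width iteration of the AVX512ICL loop: src holds 256 bytes of
 * UYVY (128 pixels); writes 128 bytes of Y, 64 of U and 64 of V. */
static void uyvy_to_yuv422_block(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                 const uint8_t *src)
{
    uint8_t perm2b[64], packus[64];
    for (int i = 0; i < 64; i++) {
        /* odd bytes of a 128-byte source pair, i.e. the Y samples
         * (same values as shuf_perm2b) */
        perm2b[i] = (uint8_t)(2 * i + 1);
        /* undo the per-128-bit-lane interleave of packuswb
         * (same values as shuf_packus) */
        packus[i] = (uint8_t)(16 * ((i >> 2) & 3) + 4 * (i >> 4) + (i & 3));
    }
    const __m512i idx_y  = _mm512_loadu_si512(perm2b);
    const __m512i idx_uv = _mm512_loadu_si512(packus);
    const __m512i lo8    = _mm512_set1_epi16(0x00ff); /* keep U/V byte of each word */

    __m512i s0 = _mm512_loadu_si512(src);
    __m512i s1 = _mm512_loadu_si512(src + 64);
    __m512i s2 = _mm512_loadu_si512(src + 128);
    __m512i s3 = _mm512_loadu_si512(src + 192);

    /* Y: one vpermi2b gathers the odd bytes of two source vectors,
     * already in linear order */
    _mm512_storeu_si512(ydst,      _mm512_permutex2var_epi8(s0, idx_y, s1));
    _mm512_storeu_si512(ydst + 64, _mm512_permutex2var_epi8(s2, idx_y, s3));

    /* UV: mask the even bytes (UxVx...), pack words to bytes (UVUV...);
     * packus saturation is exact because every high byte is already zero */
    __m512i uv01 = _mm512_packus_epi16(_mm512_and_si512(s0, lo8),
                                       _mm512_and_si512(s1, lo8));
    __m512i uv23 = _mm512_packus_epi16(_mm512_and_si512(s2, lo8),
                                       _mm512_and_si512(s3, lo8));

    /* U from the even bytes of UVUV..., V from the odd bytes */
    __m512i u = _mm512_packus_epi16(_mm512_and_si512(uv01, lo8),
                                    _mm512_and_si512(uv23, lo8));
    __m512i v = _mm512_packus_epi16(_mm512_srli_epi16(uv01, 8),
                                    _mm512_srli_epi16(uv23, 8));

    /* packuswb works per 128-bit lane, so relinearize with vpermb */
    _mm512_storeu_si512(udst, _mm512_permutexvar_epi8(idx_uv, u));
    _mm512_storeu_si512(vdst, _mm512_permutexvar_epi8(idx_uv, v));
}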